library(car)
## Loading required package: carData
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(readxl)
library(MLmetrics)
## 
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:base':
## 
##     Recall
#load the dataset
mydata <- read.csv('./data/day.csv')
#convert 'dteday' column to Date format
mydata$dteday <- as.Date(mydata$dteday)

#season
mydata$season <- cut(mydata$season,
                     breaks = c(0.5, 1.5, 2.5, 3.5, 4.5),
                     labels = c("Winter", "Spring", "Summer", "Fall"))
mydata$season <- factor(mydata$season, levels = c("Winter", "Spring", "Summer", "Fall"))

#workingday
mydata$workingday <- ifelse(mydata$workingday == 0, "Not_Workingday", "Workingday")
mydata$workingday <- factor(mydata$workingday, levels = c("Not_Workingday", "Workingday"))

#weather
mydata$weathersit <- cut(mydata$weathersit,
                         breaks = c(0.5, 1.5, 2.5, 3.5, 4.5),
                         labels = c("Weather_1", "Weather_2", "Weather_3", "Weather_4"))
mydata$weathersit <- factor(mydata$weathersit, levels = c("Weather_1", "Weather_2", "Weather_3", "Weather_4"))

head(mydata)
##   instant     dteday season yr mnth holiday weekday     workingday weathersit
## 1       1 2011-01-01 Winter  0    1       0       6 Not_Workingday  Weather_2
## 2       2 2011-01-02 Winter  0    1       0       0 Not_Workingday  Weather_2
## 3       3 2011-01-03 Winter  0    1       0       1     Workingday  Weather_1
## 4       4 2011-01-04 Winter  0    1       0       2     Workingday  Weather_1
## 5       5 2011-01-05 Winter  0    1       0       3     Workingday  Weather_1
## 6       6 2011-01-06 Winter  0    1       0       4     Workingday  Weather_1
##       temp    atemp      hum windspeed casual registered  cnt
## 1 0.344167 0.363625 0.805833 0.1604460    331        654  985
## 2 0.363478 0.353739 0.696087 0.2485390    131        670  801
## 3 0.196364 0.189405 0.437273 0.2483090    120       1229 1349
## 4 0.200000 0.212122 0.590435 0.1602960    108       1454 1562
## 5 0.226957 0.229270 0.436957 0.1869000     82       1518 1600
## 6 0.204348 0.233209 0.518261 0.0895652     88       1518 1606

Data Splitting (Training & Validation): We split the data into training (year = 2011) and validation sets (year = 2012)

The year 2012 dataset will be used for prediction.

# Set a seed for reproducibility
# Setting a seed is crucial for reproducibility and consistency of results.
# It ensures that when you run the same code multiple times, you'll ...
# ... obtain the same results.
set.seed(20231103)
# Filter dataset for year 2011 (0) and year 2012 (1)
year2011 <- mydata[mydata$yr == 0, ]
year2012 <- mydata[mydata$yr == 1, ]

# Add 'Type' column and assign values
year2011$Type <- "Training"
year2012$Type <- "Validation"

# Combine training and validation data
mydata <- rbind(year2011, year2012)
#creating training data our of mydata (training -> data from 2011)
training_data <- subset(mydata, Type == "Training")
summary(training_data)
##     instant        dteday              season         yr         mnth       
##  Min.   :  1   Min.   :2011-01-01   Winter:90   Min.   :0   Min.   : 1.000  
##  1st Qu.: 92   1st Qu.:2011-04-02   Spring:92   1st Qu.:0   1st Qu.: 4.000  
##  Median :183   Median :2011-07-02   Summer:94   Median :0   Median : 7.000  
##  Mean   :183   Mean   :2011-07-02   Fall  :89   Mean   :0   Mean   : 6.526  
##  3rd Qu.:274   3rd Qu.:2011-10-01               3rd Qu.:0   3rd Qu.:10.000  
##  Max.   :365   Max.   :2011-12-31               Max.   :0   Max.   :12.000  
##     holiday          weekday               workingday      weathersit 
##  Min.   :0.0000   Min.   :0.000   Not_Workingday:115   Weather_1:226  
##  1st Qu.:0.0000   1st Qu.:1.000   Workingday    :250   Weather_2:124  
##  Median :0.0000   Median :3.000                        Weather_3: 15  
##  Mean   :0.0274   Mean   :3.008                        Weather_4:  0  
##  3rd Qu.:0.0000   3rd Qu.:5.000                                       
##  Max.   :1.0000   Max.   :6.000                                       
##       temp             atemp              hum           windspeed      
##  Min.   :0.05913   Min.   :0.07907   Min.   :0.0000   Min.   :0.02239  
##  1st Qu.:0.32500   1st Qu.:0.32195   1st Qu.:0.5383   1st Qu.:0.13558  
##  Median :0.47917   Median :0.47285   Median :0.6475   Median :0.18690  
##  Mean   :0.48666   Mean   :0.46684   Mean   :0.6437   Mean   :0.19140  
##  3rd Qu.:0.65667   3rd Qu.:0.61238   3rd Qu.:0.7421   3rd Qu.:0.23508  
##  Max.   :0.84917   Max.   :0.84090   Max.   :0.9725   Max.   :0.50746  
##      casual         registered        cnt           Type          
##  Min.   :   9.0   Min.   : 416   Min.   : 431   Length:365        
##  1st Qu.: 222.0   1st Qu.:1730   1st Qu.:2132   Class :character  
##  Median : 614.0   Median :2915   Median :3740   Mode  :character  
##  Mean   : 677.4   Mean   :2728   Mean   :3406                     
##  3rd Qu.: 871.0   3rd Qu.:3632   3rd Qu.:4586                     
##  Max.   :3065.0   Max.   :4614   Max.   :6043
#creating training data our of mydata (validatioin -> data from 2012)
validation_data <- subset(mydata, Type == "Validation")
summary(validation_data)
##     instant          dteday              season         yr         mnth       
##  Min.   :366.0   Min.   :2012-01-01   Winter:91   Min.   :1   Min.   : 1.000  
##  1st Qu.:457.2   1st Qu.:2012-04-01   Spring:92   1st Qu.:1   1st Qu.: 4.000  
##  Median :548.5   Median :2012-07-01   Summer:94   Median :1   Median : 7.000  
##  Mean   :548.5   Mean   :2012-07-01   Fall  :89   Mean   :1   Mean   : 6.514  
##  3rd Qu.:639.8   3rd Qu.:2012-09-30               3rd Qu.:1   3rd Qu.: 9.750  
##  Max.   :731.0   Max.   :2012-12-31               Max.   :1   Max.   :12.000  
##     holiday           weekday               workingday      weathersit 
##  Min.   :0.00000   Min.   :0.000   Not_Workingday:116   Weather_1:237  
##  1st Qu.:0.00000   1st Qu.:1.000   Workingday    :250   Weather_2:123  
##  Median :0.00000   Median :3.000                        Weather_3:  6  
##  Mean   :0.03005   Mean   :2.986                        Weather_4:  0  
##  3rd Qu.:0.00000   3rd Qu.:5.000                                       
##  Max.   :1.00000   Max.   :6.000                                       
##       temp            atemp             hum           windspeed      
##  Min.   :0.1075   Min.   :0.1017   Min.   :0.2542   Min.   :0.04665  
##  1st Qu.:0.3477   1st Qu.:0.3507   1st Qu.:0.5081   1st Qu.:0.13372  
##  Median :0.5142   Median :0.4978   Median :0.6119   Median :0.17475  
##  Mean   :0.5041   Mean   :0.4819   Mean   :0.6122   Mean   :0.18957  
##  3rd Qu.:0.6540   3rd Qu.:0.6076   3rd Qu.:0.7111   3rd Qu.:0.23120  
##  Max.   :0.8617   Max.   :0.8049   Max.   :0.9250   Max.   :0.44156  
##      casual         registered        cnt           Type          
##  Min.   :   2.0   Min.   :  20   Min.   :  22   Length:366        
##  1st Qu.: 429.8   1st Qu.:3730   1st Qu.:4369   Class :character  
##  Median : 904.5   Median :4776   Median :5927   Mode  :character  
##  Mean   :1018.5   Mean   :4581   Mean   :5600                     
##  3rd Qu.:1262.0   3rd Qu.:5663   3rd Qu.:7011                     
##  Max.   :3410.0   Max.   :6946   Max.   :8714

We choose 3 questions of interest:

  1. How does working day affect the number of registered users? where Y is the numnber of registered users
  2. How does season affect the number of casual users? where Y is the sqrt(casual users)
  3. Is weather correlated to the number of bike rentals?* where Y is the count of bikes rented

Datasets Comparison: Training vs. Validation

Boxplots for proposed covariates y

#registered
ggplot(mydata, aes(x = Type, y = registered, color = Type)) +
  geom_boxplot(position = position_dodge(width = 0.8)) +
  labs(x = "Type", y = "# Registered Users") +
  scale_fill_manual(name = "Dataset Type", values = c("Training" = "blue", "Validation" = "red")) +
  ggtitle("# Registered Users by Type (Training vs. Validation)") +
  theme_bw()

#sqrt(casual)
ggplot(mydata, aes(x = Type, y = sqrt(casual), color = Type)) +
  geom_boxplot(position = position_dodge(width = 0.8)) +
  labs(x = "Type", y = "Square Root # Casual Users") +
  scale_fill_manual(name = "Dataset Type", values = c("Training" = "blue", "Validation" = "red")) +
  ggtitle("Square Root # Casual Users by Type (Training vs. Validation)") +
  theme_bw()

#cnt
ggplot(mydata, aes(x = Type, y = cnt, color = Type)) +
  geom_boxplot(position = position_dodge(width = 0.8)) +
  labs(x = "Type", y = "# Total Users") +
  scale_fill_manual(name = "Dataset Type", values = c("Training" = "blue", "Validation" = "red")) +
  ggtitle("# Total Users by Type (Training vs. Validation)") +
  theme_bw()

# Function to calculate statistics
calculate_statistics <- function(x) {
  return(c(
    Min = min(x, na.rm = TRUE),
    Q1 = quantile(x, 0.25, na.rm = TRUE),
    Median = median(x, na.rm = TRUE),
    Mean = mean(x, na.rm = TRUE),
    Q3 = quantile(x, 0.75, na.rm = TRUE),
    Max = max(x, na.rm = TRUE)
  ))
}

Histogram for Number of Bikes Rented

The Histogram shows that the distribution of cnt in training dataset shows a bimodal shape which has two peaks, while the validation shows left skewed with only one peak.

par(mfrow = c(1, 2))
hist(training_data$cnt, main = "Training Set Histogram", xlab = "# bikes rented", col = "blue")
hist(validation_data$cnt, main = "Validation Set Histogram", xlab = "# bikes rented", col = "red")

par(mfrow = c(1, 1))

Histogram for Number of Registered Users and Sqrt_Casual Users

The histograms for registered users are similarly normally distributed, and both are slightly right skewed. The histogram of sqrt(casual) users for each data set shows similarity.

#for registered
par(mfrow = c(1, 2))
hist(training_data$registered, main = "Training Set Histogram", xlab = "registered users", col = "blue")
hist(validation_data$registered, main = "Validation Set Histogram", xlab = "registered users", col = "red")

par(mfrow = c(1, 1))

#for casual
par(mfrow = c(1, 2))
hist(sqrt(training_data$casual), main = "Training Set Histogram", xlab = "sqrt casual users", col = "blue")
hist(sqrt(validation_data$casual), main = "Validation Set Histogram", xlab = "sqrt casual users", col = "red")

par(mfrow = c(1, 1))

Categorical Predictors: [weather, season, workingday] Continuous Predictors: [temperature, feeling temperature, wind speed, humidity]

Counts and Proportions For categorical

The categorical variables in the two data sets have similar counts and proportions.

#model1 <- lm(registered~as.factor(workingday) + as.factor(yr) + as.factor(weather))
#model2 <- lm(sqrt(casual)~as.factor(season) + as.factor(yr) + as.factor(weather) + as.factor(workingday))
#model3 <- lm(cnt ~  temp + wind + hum + as.factor(yr) + as.factor(workingday))

# WorkingDay
training_counts_workingday <- table(training_data$workingday)
validation_counts_workingday <- table(validation_data$workingday)

training_props_workingday <- prop.table(training_counts_workingday)
validation_props_workingday <- prop.table(validation_counts_workingday)

# Season
training_counts_season <- table(training_data$season)
validation_counts_season <- table(validation_data$season)

training_props_season <- prop.table(training_counts_season)
validation_props_season <- prop.table(validation_counts_season)

# Year
training_counts_yr <- table(training_data$yr)
validation_counts_yr <- table(validation_data$yr)

training_props_yr <- prop.table(training_counts_yr)
validation_props_yr <- prop.table(validation_counts_yr)

# Holiday
training_counts_holiday <- table(training_data$holiday)
validation_counts_holiday <- table(validation_data$holiday)

training_props_holiday <- prop.table(training_counts_holiday)
validation_props_holiday <- prop.table(validation_counts_holiday)

# Weathersit
training_counts_weathersit <- table(training_data$weathersit)
validation_counts_weathersit <- table(validation_data$weathersit)

training_props_weathersit <- prop.table(training_counts_weathersit)
validation_props_weathersit <- prop.table(validation_counts_weathersit)
# Printing out the tables
# For a better layout, you might want to print these one by one or use a custom layout with a package like gridExtra
#working day
cat("Training Set - Working Day Count and Proportion")
## Training Set - Working Day Count and Proportion
print(training_counts_workingday)
## 
## Not_Workingday     Workingday 
##            115            250
print(training_props_workingday)
## 
## Not_Workingday     Workingday 
##      0.3150685      0.6849315
cat("\nValidation Set - Working Day Count and Proportion")
## 
## Validation Set - Working Day Count and Proportion
print(validation_counts_workingday)
## 
## Not_Workingday     Workingday 
##            116            250
print(validation_props_workingday)
## 
## Not_Workingday     Workingday 
##      0.3169399      0.6830601
cat("\n")
#season
cat("Training Set - Season Count and Proportion")
## Training Set - Season Count and Proportion
print(training_counts_season)
## 
## Winter Spring Summer   Fall 
##     90     92     94     89
print(training_props_season)
## 
##    Winter    Spring    Summer      Fall 
## 0.2465753 0.2520548 0.2575342 0.2438356
cat("\nValidation Set - Season Count and Proportion")
## 
## Validation Set - Season Count and Proportion
print(validation_counts_season)
## 
## Winter Spring Summer   Fall 
##     91     92     94     89
print(validation_props_season)
## 
##    Winter    Spring    Summer      Fall 
## 0.2486339 0.2513661 0.2568306 0.2431694
cat("\n")
#year
cat("Training Set - Year Count and Proportion")
## Training Set - Year Count and Proportion
print(training_counts_yr)
## 
##   0 
## 365
print(training_props_yr)
## 
## 0 
## 1
cat("\nValidation Set - Year Count and Proportion")
## 
## Validation Set - Year Count and Proportion
print(validation_counts_yr)
## 
##   1 
## 366
print(validation_props_yr)
## 
## 1 
## 1
cat("\n")
#holiday
cat("Training Set - Holiday Count and Proportion")
## Training Set - Holiday Count and Proportion
print(training_counts_holiday)
## 
##   0   1 
## 355  10
print(training_props_holiday)
## 
##          0          1 
## 0.97260274 0.02739726
cat("\nValidation Set - Holiday Count and Proportion")
## 
## Validation Set - Holiday Count and Proportion
print(validation_counts_holiday)
## 
##   0   1 
## 355  11
print(validation_props_holiday)
## 
##          0          1 
## 0.96994536 0.03005464
cat("\n")
#weathersit
cat("Training Set - Weathersit Count and Proportion")
## Training Set - Weathersit Count and Proportion
print(training_counts_weathersit)
## 
## Weather_1 Weather_2 Weather_3 Weather_4 
##       226       124        15         0
print(training_props_weathersit)
## 
##  Weather_1  Weather_2  Weather_3  Weather_4 
## 0.61917808 0.33972603 0.04109589 0.00000000
cat("\nValidation Set - Weathersit Count and Proportion")
## 
## Validation Set - Weathersit Count and Proportion
print(validation_counts_weathersit)
## 
## Weather_1 Weather_2 Weather_3 Weather_4 
##       237       123         6         0
print(validation_props_weathersit)
## 
##  Weather_1  Weather_2  Weather_3  Weather_4 
## 0.64754098 0.33606557 0.01639344 0.00000000

Histograms For continuous Predictors

All four continuous variables are similarly distributed in the two data sets. In general, the temperature in 2012 is higher than in 2011, and the humidity is relatively lower and more widely spread in 2012 than that in 2011. Also, the wind speed is generally higher in 2012.

#For temp
par(mfrow = c(1, 2))
hist(training_data$temp, main = "Training Set Histogram", xlab = "temperature", col = "blue")
hist(validation_data$temp, main = "Validation Set Histogram", xlab = "temperature", col = "red")

par(mfrow = c(1, 1))

#For feeling temp
par(mfrow = c(1, 2))
hist(training_data$atemp, main = "Training Set Histogram", xlab = "feeling temperature", col = "blue")
hist(validation_data$atemp, main = "Validation Set Histogram", xlab = "feeling temperature", col = "red")

par(mfrow = c(1, 1))

#For hum
par(mfrow = c(1, 2))
hist(training_data$hum, main = "Training Set Histogram", xlab = "humidity", col = "blue")
hist(validation_data$hum, main = "Validation Set Histogram", xlab = "humidity", col = "red")

par(mfrow = c(1, 1))

#For windspeed
par(mfrow = c(1, 2))
hist(training_data$windspeed, main = "Training Set Histogram", xlab = "windspeed", col = "blue")
hist(validation_data$windspeed, main = "Validation Set Histogram", xlab = "windspeed", col = "red")

par(mfrow = c(1, 1))

Model Fitting & obtain outputs of the model

Full models for each question

We are fitting full model which will be used to stepwise regression later. We address multicollinearity issue and delete variables that might cause it.

Viki trail 1

#viki trail 1

full_model1 <- lm(registered ~ as.factor(workingday) + as.factor(season)+ holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)

full_model2 <- lm( sqrt(casual)~ as.factor(workingday) + as.factor(season) + holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)

full_model3 <- lm( cnt ~ casual +as.factor(workingday) + as.factor(season) + holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)

summary(full_model1)
## 
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) + 
##     holiday + weekday + temp + hum + windspeed + as.factor(weathersit), 
##     data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1831.18  -249.18    45.54   306.01  1029.65 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       939.50     182.22   5.156 4.21e-07 ***
## as.factor(workingday)Workingday   698.55      55.36  12.617  < 2e-16 ***
## as.factor(season)Spring           812.16      92.94   8.738  < 2e-16 ***
## as.factor(season)Summer           794.67     121.99   6.514 2.52e-10 ***
## as.factor(season)Fall            1306.52      82.32  15.872  < 2e-16 ***
## holiday                          -196.65     156.92  -1.253 0.210974    
## weekday                            17.57      12.46   1.410 0.159397    
## temp                             2757.41     236.16  11.676  < 2e-16 ***
## hum                              -610.45     230.43  -2.649 0.008431 ** 
## windspeed                       -1469.23     351.94  -4.175 3.76e-05 ***
## as.factor(weathersit)Weather_2   -224.29      66.00  -3.398 0.000756 ***
## as.factor(weathersit)Weather_3  -1377.56     146.15  -9.426  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 471.3 on 353 degrees of freedom
## Multiple R-squared:  0.8083, Adjusted R-squared:  0.8024 
## F-statistic: 135.3 on 11 and 353 DF,  p-value: < 2.2e-16
summary(full_model2)
## 
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) + 
##     holiday + weekday + temp + hum + windspeed + as.factor(weathersit), 
##     data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.0473  -3.0247  -0.0296   2.7154  18.3344 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      19.89493    1.84453  10.786  < 2e-16 ***
## as.factor(workingday)Workingday -11.35502    0.56044 -20.261  < 2e-16 ***
## as.factor(season)Spring           6.26683    0.94083   6.661 1.05e-10 ***
## as.factor(season)Summer           3.36323    1.23484   2.724  0.00678 ** 
## as.factor(season)Fall             4.43365    0.83329   5.321 1.84e-07 ***
## holiday                          -2.56032    1.58848  -1.612  0.10790    
## weekday                           0.09252    0.12616   0.733  0.46381    
## temp                             32.15458    2.39057  13.451  < 2e-16 ***
## hum                              -6.06923    2.33258  -2.602  0.00966 ** 
## windspeed                       -14.22007    3.56256  -3.992 7.99e-05 ***
## as.factor(weathersit)Weather_2   -2.09773    0.66811  -3.140  0.00183 ** 
## as.factor(weathersit)Weather_3   -8.83239    1.47944  -5.970 5.78e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.771 on 353 degrees of freedom
## Multiple R-squared:  0.8019, Adjusted R-squared:  0.7957 
## F-statistic: 129.9 on 11 and 353 DF,  p-value: < 2.2e-16
summary(full_model3)
## 
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) + 
##     holiday + weekday + temp + hum + windspeed + as.factor(weathersit), 
##     data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2086.82  -250.88    43.02   316.95   994.65 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      6.822e+02  1.847e+02   3.694 0.000255 ***
## casual                           1.393e+00  8.172e-02  17.048  < 2e-16 ***
## as.factor(workingday)Workingday  9.622e+02  7.673e+01  12.540  < 2e-16 ***
## as.factor(season)Spring          7.127e+02  9.250e+01   7.705 1.35e-13 ***
## as.factor(season)Summer          7.396e+02  1.189e+02   6.221 1.40e-09 ***
## as.factor(season)Fall            1.250e+03  8.071e+01  15.488  < 2e-16 ***
## holiday                         -1.457e+02  1.526e+02  -0.955 0.340318    
## weekday                          1.545e+01  1.210e+01   1.277 0.202431    
## temp                             2.167e+03  2.599e+02   8.338 1.72e-15 ***
## hum                             -4.957e+02  2.248e+02  -2.205 0.028098 *  
## windspeed                       -1.132e+03  3.485e+02  -3.248 0.001273 ** 
## as.factor(weathersit)Weather_2  -1.881e+02  6.446e+01  -2.918 0.003754 ** 
## as.factor(weathersit)Weather_3  -1.256e+03  1.440e+02  -8.721  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 457.2 on 352 degrees of freedom
## Multiple R-squared:  0.8937, Adjusted R-squared:  0.8901 
## F-statistic: 246.6 on 12 and 352 DF,  p-value: < 2.2e-16
#plot added variables
avPlots(full_model1)

avPlots(full_model2)

avPlots(full_model3)

Stepwise Regression

stepwise_model1 <- step(full_model1, direction = "both")
## Start:  AIC=4505.31
## registered ~ as.factor(workingday) + as.factor(season) + holiday + 
##     weekday + temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## - holiday                1    348826  78757811 4504.9
## <none>                                78408985 4505.3
## - weekday                1    441649  78850634 4505.4
## - hum                    1   1558913  79967898 4510.5
## - windspeed              1   3871169  82280153 4520.9
## - as.factor(weathersit)  2  19753550  98162534 4583.3
## - temp                   1  30282108 108691093 4622.5
## - as.factor(workingday)  1  35361497 113770481 4639.2
## - as.factor(season)      3  60396050 138805035 4707.8
## 
## Step:  AIC=4504.93
## registered ~ as.factor(workingday) + as.factor(season) + weekday + 
##     temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## <none>                                78757811 4504.9
## - weekday                1    509480  79267291 4505.3
## + holiday                1    348826  78408985 4505.3
## - hum                    1   1478325  80236136 4509.7
## - windspeed              1   3852838  82610649 4520.4
## - as.factor(weathersit)  2  19849983  98607794 4583.0
## - temp                   1  30118568 108876379 4621.1
## - as.factor(workingday)  1  39599907 118357718 4651.6
## - as.factor(season)      3  60285076 139042887 4706.4
stepwise_model2 <- step(full_model2, direction = "both")
## Start:  AIC=1152.44
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + holiday + 
##     weekday + temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq     RSS    AIC
## - weekday                1      12.2  8046.8 1151.0
## <none>                                8034.6 1152.4
## - holiday                1      59.1  8093.7 1153.1
## - hum                    1     154.1  8188.7 1157.4
## - windspeed              1     362.6  8397.2 1166.5
## - as.factor(weathersit)  2     829.0  8863.6 1184.3
## - as.factor(season)      3    1441.3  9475.9 1206.7
## - temp                   1    4117.9 12152.4 1301.5
## - as.factor(workingday)  1    9343.4 17378.0 1432.0
## 
## Step:  AIC=1151
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + holiday + 
##     temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq     RSS    AIC
## <none>                                8046.8 1151.0
## - holiday                1      63.9  8110.7 1151.9
## + weekday                1      12.2  8034.6 1152.4
## - hum                    1     164.6  8211.5 1156.4
## - windspeed              1     359.7  8406.5 1165.0
## - as.factor(weathersit)  2     818.8  8865.6 1182.4
## - as.factor(season)      3    1445.3  9492.1 1205.3
## - temp                   1    4107.9 12154.7 1299.5
## - as.factor(workingday)  1    9348.6 17395.5 1430.4
stepwise_model3 <- step(full_model3, direction = "both")
## Start:  AIC=4484.06
## cnt ~ casual + as.factor(workingday) + as.factor(season) + holiday + 
##     weekday + temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## - holiday                1    190554  73761582 4483.0
## - weekday                1    340857  73911885 4483.7
## <none>                                73571028 4484.1
## - hum                    1   1016235  74587264 4487.1
## - windspeed              1   2205158  75776187 4492.8
## - temp                   1  14531329  88102358 4547.9
## - as.factor(weathersit)  2  15979331  89550360 4551.8
## - as.factor(workingday)  1  32868919 106439948 4616.9
## - as.factor(season)      3  54301778 127872806 4679.8
## - casual                 1  60747394 134318422 4701.8
## 
## Step:  AIC=4483
## cnt ~ casual + as.factor(workingday) + as.factor(season) + weekday + 
##     temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## - weekday                1    383340  74144922 4482.9
## <none>                                73761582 4483.0
## + holiday                1    190554  73571028 4484.1
## - hum                    1    963292  74724874 4485.7
## - windspeed              1   2178046  75939628 4491.6
## - temp                   1  14386687  88148269 4546.0
## - as.factor(weathersit)  2  15987090  89748672 4550.6
## - as.factor(workingday)  1  35790437 109552019 4625.4
## - as.factor(season)      3  54174909 127936491 4678.0
## - casual                 1  61516967 135278549 4702.4
## 
## Step:  AIC=4482.9
## cnt ~ casual + as.factor(workingday) + as.factor(season) + temp + 
##     hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## <none>                                74144922 4482.9
## + weekday                1    383340  73761582 4483.0
## + holiday                1    233037  73911885 4483.7
## - hum                    1   1087039  75231961 4486.2
## - windspeed              1   2121892  76266814 4491.2
## - temp                   1  14190777  88335699 4544.8
## - as.factor(weathersit)  2  15737193  89882115 4549.2
## - as.factor(workingday)  1  36139539 110284461 4625.8
## - as.factor(season)      3  54380050 128524972 4677.7
## - casual                 1  62035815 136180737 4702.8
summary(stepwise_model1)
## 
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) + 
##     weekday + temp + hum + windspeed + as.factor(weathersit), 
##     data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1834.15  -238.70    46.37   304.93  1029.11 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       911.40     180.98   5.036 7.59e-07 ***
## as.factor(workingday)Workingday   715.86      53.66  13.341  < 2e-16 ***
## as.factor(season)Spring           815.27      92.98   8.768  < 2e-16 ***
## as.factor(season)Summer           798.73     122.04   6.545 2.09e-10 ***
## as.factor(season)Fall            1305.98      82.38  15.853  < 2e-16 ***
## weekday                            18.82      12.43   1.513  0.13110    
## temp                             2748.78     236.25  11.635  < 2e-16 ***
## hum                              -593.43     230.21  -2.578  0.01035 *  
## windspeed                       -1465.70     352.21  -4.161 3.97e-05 ***
## as.factor(weathersit)Weather_2   -229.81      65.91  -3.487  0.00055 ***
## as.factor(weathersit)Weather_3  -1381.03     146.24  -9.444  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 471.7 on 354 degrees of freedom
## Multiple R-squared:  0.8075, Adjusted R-squared:  0.802 
## F-statistic: 148.5 on 10 and 354 DF,  p-value: < 2.2e-16
summary(stepwise_model2)
## 
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) + 
##     holiday + temp + hum + windspeed + as.factor(weathersit), 
##     data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.7811  -2.9288  -0.0188   2.7108  18.2147 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      20.2681     1.7718  11.439  < 2e-16 ***
## as.factor(workingday)Workingday -11.3579     0.5601 -20.280  < 2e-16 ***
## as.factor(season)Spring           6.2838     0.9399   6.685 8.99e-11 ***
## as.factor(season)Summer           3.3963     1.2332   2.754  0.00619 ** 
## as.factor(season)Fall             4.4543     0.8323   5.352 1.57e-07 ***
## holiday                          -2.6529     1.5824  -1.677  0.09452 .  
## temp                             32.1003     2.3879  13.443  < 2e-16 ***
## hum                              -6.2416     2.3192  -2.691  0.00746 ** 
## windspeed                       -14.1586     3.5593  -3.978 8.43e-05 ***
## as.factor(weathersit)Weather_2   -2.0546     0.6651  -3.089  0.00217 ** 
## as.factor(weathersit)Weather_3   -8.7637     1.4755  -5.939 6.84e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.768 on 354 degrees of freedom
## Multiple R-squared:  0.8016, Adjusted R-squared:  0.796 
## F-statistic:   143 on 10 and 354 DF,  p-value: < 2.2e-16
summary(stepwise_model3)
## 
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) + 
##     temp + hum + windspeed + as.factor(weathersit), data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2060.85  -257.74    44.82   332.76  1011.50 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      7.189e+02  1.775e+02   4.050 6.29e-05 ***
## casual                           1.403e+00  8.153e-02  17.210  < 2e-16 ***
## as.factor(workingday)Workingday  9.826e+02  7.480e+01  13.136  < 2e-16 ***
## as.factor(season)Spring          7.157e+02  9.258e+01   7.730 1.12e-13 ***
## as.factor(season)Summer          7.474e+02  1.189e+02   6.285 9.65e-10 ***
## as.factor(season)Fall            1.252e+03  8.076e+01  15.501  < 2e-16 ***
## temp                             2.135e+03  2.594e+02   8.231 3.60e-15 ***
## hum                             -5.094e+02  2.236e+02  -2.278  0.02331 *  
## windspeed                       -1.110e+03  3.486e+02  -3.183  0.00159 ** 
## as.factor(weathersit)Weather_2  -1.840e+02  6.418e+01  -2.867  0.00439 ** 
## as.factor(weathersit)Weather_3  -1.243e+03  1.438e+02  -8.645  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 457.7 on 354 degrees of freedom
## Multiple R-squared:  0.8928, Adjusted R-squared:  0.8898 
## F-statistic:   295 on 10 and 354 DF,  p-value: < 2.2e-16

model 1: R^2 = 0.8075

model 2: R^2 = 0.8016

model 3: R^2 = 0.8928

avPlots(stepwise_model1)

avPlots(stepwise_model2)

avPlots(stepwise_model3)

For vif, values close to 1 indicate no multicollinearity, values between 1-3 indicate mild collinearity which is not ideal but can be considered ok since it won’t severaly impact the interpretability of the coefficients, values above indicate severe multicollinearity which is considered unreliable.

vif(stepwise_model1)
##                           GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.019319  1        1.009613
## as.factor(season)     3.639198  3        1.240226
## weekday               1.017995  1        1.008957
## temp                  3.282494  1        1.811766
## hum                   1.918458  1        1.385084
## windspeed             1.199915  1        1.095406
## as.factor(weathersit) 1.834088  2        1.163737
vif(stepwise_model2)
##                           GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.086910  1        1.042550
## as.factor(season)     3.640064  3        1.240276
## holiday               1.071406  1        1.035087
## temp                  3.282132  1        1.811666
## hum                   1.905606  1        1.380437
## windspeed             1.199328  1        1.095138
## as.factor(weathersit) 1.826326  2        1.162504
vif(stepwise_model3)
##                           GVIF Df GVIF^(1/(2*Df))
## casual                3.575021  1        1.890773
## as.factor(workingday) 2.104307  1        1.450623
## as.factor(season)     3.879645  3        1.253522
## temp                  4.203539  1        2.050253
## hum                   1.922350  1        1.386488
## windspeed             1.248688  1        1.117447
## as.factor(weathersit) 1.881977  2        1.171261

Validation

validation_data$predicted_registered <- predict(stepwise_model1, newdata = validation_data)
validation_data$predicted_sqrtcasual <- predict(stepwise_model2, newdata = validation_data)
validation_data$predicted_count <- predict(stepwise_model3, newdata = validation_data)

Note: here in the predicitons I am unsure why the firts one is negative, in thesecond quetsion, I am unsure whether I should square the original value or not (not sure wthere it predicts the squared casaul count or not). In the third question, it produces Nans???

observed_values1 <- validation_data$registered
predicted_values1 <- validation_data$predicted_registered
# Calculate different prediction performance metrics
# Functions from the MLmetrics package
# Common regression metrics
# Calculate the Root Mean Squared Error (RMSE), which measures the ...
# ... average magnitude of prediction errors.
# Lower is better.
rmse1 <- RMSE(predicted_values1, observed_values1)
# same as
# sqrt(mean((observed_values - predicted_values)^2))

# Compute the Mean Absolute Error (MAE), indicating the average absolute ...
# ... difference between predicted and observed values.
# Lower is better.
mae1 <- MAE(predicted_values1, observed_values1)
# same as
# mean(abs(observed_values - predicted_values))

# Calculate the Mean Absolute Percentage Error (MAPE), measuring the ...
# ... average percentage difference between predicted and observed values.
mape1 <- MAPE(predicted_values1, observed_values1)
# same as
# mean(abs(predicted_values-observed_values)/observed_values)

# Determine the R-squared (R²) Score, representing the proportion of the ...
# ... variance in the observed values (of validation data set) ... 
# ... explained by the predicted values from the model.
# Higher is better.
r_squared1 <- R2_Score(predicted_values1, observed_values1)
# same as
# summary(lm(observed_values ~ predicted_values))$r.squared

# Display the calculated metrics
cat("Root Mean Squared Error (RMSE):", round(rmse1, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 1929.403
cat("Mean Absolute Error (MAE):", round(mae1, digits = 4), "\n")
## Mean Absolute Error (MAE): 1799.901
cat("R-squared (R^2) Score:", round(r_squared1, digits = 4), "\n")
## R-squared (R^2) Score: -0.84
cat("Mean Absolute Percentage Error (MPE):", round(mape1, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.6342
#second question
observed_values2 <- sqrt(validation_data$casual) #should I transform it here??
predicted_values2 <- validation_data$predicted_sqrtcasual
rmse2 <- RMSE(predicted_values2, observed_values2)
mae2 <- MAE(predicted_values2, observed_values2)
mape2 <- MAPE(predicted_values2, observed_values2)
r_squared2 <- R2_Score(predicted_values2, observed_values2)
cat("Root Mean Squared Error (RMSE):", round(rmse2, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 7.5939
cat("Mean Absolute Error (MAE):", round(mae2, digits = 4), "\n")
## Mean Absolute Error (MAE): 5.9561
cat("R-squared (R^2) Score:", round(r_squared2, digits = 4), "\n")
## R-squared (R^2) Score: 0.5865
cat("Mean Absolute Percentage Error (MPE):", round(mape2, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2126
#third question
observed_values3 <- validation_data$count
predicted_values3 <- validation_data$predicted_count
rmse3 <- RMSE(predicted_values3, observed_values3)
mae3 <- MAE(predicted_values3, observed_values3)
mape3 <- MAPE(predicted_values3, observed_values3)
r_squared3 <- R2_Score(predicted_values3, observed_values3)
## Warning in mean.default(y_true): argument is not numeric or logical: returning
## NA
cat("Root Mean Squared Error (RMSE):", round(rmse3, digits = 4), "\n")
## Root Mean Squared Error (RMSE): NaN
cat("Mean Absolute Error (MAE):", round(mae3, digits = 4), "\n")
## Mean Absolute Error (MAE): NaN
cat("R-squared (R^2) Score:", round(r_squared3, digits = 4), "\n")
## R-squared (R^2) Score: NaN
cat("Mean Absolute Percentage Error (MPE):", round(mape3, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): NaN

not working for model 3??

Diagnostics of Training model

we choose model 1 because it has best RMSE for validation

#stepwise regression on model 1
m.mlr <- lm(registered ~ as.factor(workingday) + as.factor(season) +
    temp + windspeed + as.factor(weathersit),
                 data = training_data)

#original regression question 1
m.mlr1 <- lm(registered ~ as.factor(workingday) + as.factor(weathersit),
                 data = training_data)

#question2
m.mlr2 <- lm(sqrt(casual) ~ as.factor(workingday) + as.factor(season) + holiday + 
  temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)
diagnostics_df <- data.frame(
  Residuals = resid(m.mlr),
  Fitted_Values = fitted(m.mlr),
  Standardized_Residuals = rstandard(m.mlr),
  Leverage = hatvalues(m.mlr)
  #Date = year2011$dteday
)

# Create the standardized residuals vs. fitted values plot
ggplot(diagnostics_df, aes(x = Fitted_Values, y = Residuals)) +
  geom_point(col="blue", alpha=0.75) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs( title = "Residuals vs. Fitted Values",
        x = "Fitted Values", y = "Residuals") +
  theme_bw()

# Create the QQ plot
ggplot(diagnostics_df, aes(sample = Standardized_Residuals)) +
  stat_qq(aes(sample = Standardized_Residuals), distribution = qnorm,
          size = 2, col="blue", alpha = 0.75) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(title = "Standardized Residual QQ Plot",
       x = "Theoretical Quantiles", y = "Sample Quantiles") +
  theme_bw()

# Create the sqrt(|standardized residuals|) vs. fitted values plot
ggplot(diagnostics_df, aes(x = Fitted_Values, y = sqrt(abs(Standardized_Residuals)))) +
  geom_point(col="blue", alpha=0.75) +
  labs( title = "Residuals vs. Fitted Values",
        x = "Fitted Values", y = "sqrt(|Standardized Residuals|)") +
  theme_bw()

# Leverage vs Standardized Residuals
ggplot(diagnostics_df, aes(x = Leverage, y = Standardized_Residuals)) +
  geom_point(alpha = 0.75) +
  labs(title = "Standardized Residuals vs. Leverage Plot",
       x = "Leverage", y = "Standardized Residuals") +
  theme_bw()

Prediction

validation2012<- subset(validation_data)
validation2012$predicted_registered <- predict(m.mlr, newdata = validation2012)
summary(validation2012$predicted_registered)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -673.6  2234.2  3031.8  2812.5  3492.8  4182.6
# Extract observed and predicted values
observed_values <- validation2012$registered
predicted_values <- validation2012$predicted_registered
rmse <- RMSE(predicted_values, observed_values)
mae <- MAE(predicted_values, observed_values)
mape <- MAPE(predicted_values, observed_values)
r_squared <- R2_Score(predicted_values, observed_values)


cat("Root Mean Squared Error (RMSE):", round(rmse, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 1949.03
cat("Mean Absolute Error (MAE):", round(mae, digits = 4), "\n")
## Mean Absolute Error (MAE): 1815.744
cat("R-squared (R^2) Score:", round(r_squared, digits = 4), "\n")
## R-squared (R^2) Score: -0.8776
cat("Mean Absolute Percentage Error (MPE):", round(mape, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.6476
ggplot(validation2012, aes(x = observed_values, y = predicted_values)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(x = "Observed Values", y = "Predicted Values",
       title = "Observed vs. Predicted Values") +
  theme_bw()

# Residuals plot
ggplot(validation2012, aes(x = 1:nrow(validation2012), y = observed_values-predicted_values)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 0, color = "red", linetype = "dashed") +
  labs(x = "Observation Index", y = "Residuals",
       title = "Observed vs. Predicted Values") +
  theme_bw()

Kaylee Trial 1

#kaylee trial 1

full_model1 <- lm(registered ~ as.factor(workingday) + as.factor(season)+ as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)

full_model2 <- lm( sqrt(casual)~ as.factor(workingday) + as.factor(season)+ as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)

full_model3 <- lm( cnt ~ casual+as.factor(workingday) + as.factor(season)+ as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)

summary(full_model1)
## 
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) + 
##     as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed + 
##     as.factor(weathersit), data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1779.75  -237.37    62.58   308.19  1085.42 
## 
## Coefficients: (1 not defined because of singularities)
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       930.21     184.49   5.042 7.42e-07 ***
## as.factor(workingday)Workingday   746.78      93.56   7.982 2.10e-14 ***
## as.factor(season)Spring           816.26      93.31   8.747  < 2e-16 ***
## as.factor(season)Summer           799.01     122.44   6.526 2.39e-10 ***
## as.factor(season)Fall            1309.72      82.70  15.838  < 2e-16 ***
## as.factor(holiday)1              -114.41     178.98  -0.639 0.523072    
## as.factor(weekday)1               -49.07      94.39  -0.520 0.603455    
## as.factor(weekday)2                58.64      93.37   0.628 0.530438    
## as.factor(weekday)3                41.91      94.55   0.443 0.657891    
## as.factor(weekday)4                49.53      93.32   0.531 0.595893    
## as.factor(weekday)5                   NA         NA      NA       NA    
## as.factor(weekday)6               136.82      92.73   1.476 0.140967    
## temp                             2744.64     237.11  11.575  < 2e-16 ***
## hum                              -610.19     232.17  -2.628 0.008963 ** 
## windspeed                       -1480.21     353.67  -4.185 3.61e-05 ***
## as.factor(weathersit)Weather_2   -227.82      66.36  -3.433 0.000669 ***
## as.factor(weathersit)Weather_3  -1401.20     148.05  -9.465  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 472.7 on 349 degrees of freedom
## Multiple R-squared:  0.8094, Adjusted R-squared:  0.8012 
## F-statistic: 98.77 on 15 and 349 DF,  p-value: < 2.2e-16
summary(full_model2)
## 
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) + 
##     as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed + 
##     as.factor(weathersit), data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.0744  -2.6269   0.0553   2.7719  18.2465 
## 
## Coefficients: (1 not defined because of singularities)
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      19.8250     1.8394  10.778  < 2e-16 ***
## as.factor(workingday)Workingday  -9.4840     0.9328 -10.168  < 2e-16 ***
## as.factor(season)Spring           6.1465     0.9304   6.607 1.47e-10 ***
## as.factor(season)Summer           3.2409     1.2208   2.655 0.008298 ** 
## as.factor(season)Fall             4.3307     0.8245   5.253 2.61e-07 ***
## as.factor(holiday)1              -1.5567     1.7844  -0.872 0.383605    
## as.factor(weekday)1              -1.0232     0.9411  -1.087 0.277666    
## as.factor(weekday)2              -2.2992     0.9309  -2.470 0.013998 *  
## as.factor(weekday)3              -2.8697     0.9427  -3.044 0.002511 ** 
## as.factor(weekday)4              -2.6316     0.9304  -2.829 0.004946 ** 
## as.factor(weekday)5                   NA         NA      NA       NA    
## as.factor(weekday)6               0.2164     0.9245   0.234 0.815070    
## temp                             32.5149     2.3640  13.754  < 2e-16 ***
## hum                              -6.0657     2.3148  -2.620 0.009165 ** 
## windspeed                       -13.6013     3.5261  -3.857 0.000137 ***
## as.factor(weathersit)Weather_2   -2.0200     0.6616  -3.053 0.002439 ** 
## as.factor(weathersit)Weather_3   -8.1185     1.4760  -5.500 7.35e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.713 on 349 degrees of freedom
## Multiple R-squared:  0.8088, Adjusted R-squared:  0.8006 
## F-statistic: 98.44 on 15 and 349 DF,  p-value: < 2.2e-16
summary(full_model3)
## 
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) + 
##     as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed + 
##     as.factor(weathersit), data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2023.28  -241.28    44.38   312.36   992.46 
## 
## Coefficients: (1 not defined because of singularities)
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      6.626e+02  1.863e+02   3.557 0.000427 ***
## casual                           1.413e+00  8.248e-02  17.133  < 2e-16 ***
## as.factor(workingday)Workingday  9.866e+02  1.024e+02   9.637  < 2e-16 ***
## as.factor(season)Spring          7.140e+02  9.253e+01   7.716 1.28e-13 ***
## as.factor(season)Summer          7.434e+02  1.189e+02   6.250 1.20e-09 ***
## as.factor(season)Fall            1.252e+03  8.080e+01  15.499  < 2e-16 ***
## as.factor(holiday)1             -8.103e+01  1.732e+02  -0.468 0.640274    
## as.factor(weekday)1             -3.137e+01  9.136e+01  -0.343 0.731555    
## as.factor(weekday)2              9.987e+01  9.068e+01   1.101 0.271516    
## as.factor(weekday)3              9.560e+01  9.208e+01   1.038 0.299872    
## as.factor(weekday)4              9.781e+01  9.077e+01   1.078 0.281978    
## as.factor(weekday)5                     NA         NA      NA       NA    
## as.factor(weekday)6              1.269e+02  8.970e+01   1.415 0.157935    
## temp                             2.118e+03  2.613e+02   8.106 9.01e-15 ***
## hum                             -4.896e+02  2.258e+02  -2.168 0.030838 *  
## windspeed                       -1.137e+03  3.488e+02  -3.261 0.001221 ** 
## as.factor(weathersit)Weather_2  -1.913e+02  6.460e+01  -2.961 0.003278 ** 
## as.factor(weathersit)Weather_3  -1.286e+03  1.450e+02  -8.872  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 457.2 on 348 degrees of freedom
## Multiple R-squared:  0.8949, Adjusted R-squared:   0.89 
## F-statistic: 185.1 on 16 and 348 DF,  p-value: < 2.2e-16
#plot added variables
avPlots(full_model1)
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

avPlots(full_model2)
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

avPlots(full_model3)
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

Compared with Viki’s first trial, the AIC for model2 improved as well.(1151 to 1147)

stepwise_model1 <- step(full_model1, direction = "both")
## Start:  AIC=4511.35
## registered ~ as.factor(workingday) + as.factor(season) + as.factor(holiday) + 
##     as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit)
## 
## 
## Step:  AIC=4511.35
## registered ~ as.factor(workingday) + as.factor(season) + as.factor(weekday) + 
##     temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## - as.factor(weekday)     6   1278049  79267291 4505.3
## <none>                                77989242 4511.3
## - hum                    1   1543574  79532816 4516.5
## - windspeed              1   3914363  81903605 4527.2
## - as.factor(workingday)  1   6617374  84606616 4539.1
## - as.factor(weathersit)  2  20032722  98021964 4590.8
## - temp                   1  29941742 107930985 4627.9
## - as.factor(season)      3  60480699 138469941 4714.9
## 
## Step:  AIC=4505.28
## registered ~ as.factor(workingday) + as.factor(season) + temp + 
##     hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## <none>                                79267291 4505.3
## + as.factor(holiday)     1    416658  78850634 4505.4
## - hum                    1   1666159  80933450 4510.9
## + as.factor(weekday)     6   1278049  77989242 4511.3
## - windspeed              1   3787445  83054736 4520.3
## - as.factor(weathersit)  2  19539168  98806460 4581.7
## - temp                   1  29890751 109158042 4620.1
## - as.factor(workingday)  1  39726359 118993650 4651.6
## - as.factor(season)      3  60612796 139880087 4706.6
stepwise_model2 <- step(full_model2, direction = "both")
## Start:  AIC=1147.39
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + as.factor(holiday) + 
##     as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit)
## 
## 
## Step:  AIC=1147.39
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + as.factor(weekday) + 
##     temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq     RSS    AIC
## <none>                                7752.3 1147.4
## - as.factor(weekday)     6     358.4  8110.7 1151.9
## - hum                    1     152.5  7904.9 1152.5
## - windspeed              1     330.5  8082.8 1160.6
## - as.factor(workingday)  1     560.7  8313.0 1170.9
## - as.factor(weathersit)  2     694.1  8446.4 1174.7
## - as.factor(season)      3    1393.8  9146.2 1201.7
## - temp                   1    4202.1 11954.5 1303.5
stepwise_model3 <- step(full_model3, direction = "both")
## Start:  AIC=4487.94
## cnt ~ casual + as.factor(workingday) + as.factor(season) + as.factor(holiday) + 
##     as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit)
## 
## 
## Step:  AIC=4487.94
## cnt ~ casual + as.factor(workingday) + as.factor(season) + as.factor(weekday) + 
##     temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## - as.factor(weekday)     6   1399230  74144922 4482.9
## <none>                                72745692 4487.9
## - hum                    1    982509  73728201 4490.8
## - windspeed              1   2222597  74968289 4496.9
## - as.factor(workingday)  1   9482947  82228639 4530.7
## - temp                   1  13734596  86480288 4549.1
## - as.factor(weathersit)  2  16535261  89280953 4558.7
## - as.factor(season)      3  54344672 127090364 4685.6
## - casual                 1  61362707 134108399 4709.2
## 
## Step:  AIC=4482.9
## cnt ~ casual + as.factor(workingday) + as.factor(season) + temp + 
##     hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## <none>                                74144922 4482.9
## + as.factor(holiday)     1    233037  73911885 4483.7
## - hum                    1   1087039  75231961 4486.2
## + as.factor(weekday)     6   1399230  72745692 4487.9
## - windspeed              1   2121892  76266814 4491.2
## - temp                   1  14190777  88335699 4544.8
## - as.factor(weathersit)  2  15737193  89882115 4549.2
## - as.factor(workingday)  1  36139539 110284461 4625.8
## - as.factor(season)      3  54380050 128524972 4677.7
## - casual                 1  62035815 136180737 4702.8
summary(stepwise_model1)
## 
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) + 
##     temp + hum + windspeed + as.factor(weathersit), data = training_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1794.6  -246.9    47.2   311.4  1056.3 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       985.06     174.62   5.641 3.46e-08 ***
## as.factor(workingday)Workingday   716.94      53.75  13.338  < 2e-16 ***
## as.factor(season)Spring           819.04      93.12   8.796  < 2e-16 ***
## as.factor(season)Summer           805.89     122.17   6.596 1.53e-10 ***
## as.factor(season)Fall            1310.16      82.49  15.883  < 2e-16 ***
## temp                             2736.84     236.54  11.570  < 2e-16 ***
## hum                              -627.06     229.55  -2.732 0.006617 ** 
## windspeed                       -1452.78     352.74  -4.119 4.75e-05 ***
## as.factor(weathersit)Weather_2   -221.52      65.80  -3.367 0.000844 ***
## as.factor(weathersit)Weather_3  -1367.32     146.23  -9.351  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 472.5 on 355 degrees of freedom
## Multiple R-squared:  0.8062, Adjusted R-squared:  0.8013 
## F-statistic: 164.1 on 9 and 355 DF,  p-value: < 2.2e-16
summary(stepwise_model2)
## 
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) + 
##     as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit), 
##     data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.0744  -2.6269   0.0553   2.7719  18.2465 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      19.8250     1.8394  10.778  < 2e-16 ***
## as.factor(workingday)Workingday  -7.9273     1.5778  -5.024 8.09e-07 ***
## as.factor(season)Spring           6.1465     0.9304   6.607 1.47e-10 ***
## as.factor(season)Summer           3.2409     1.2208   2.655 0.008298 ** 
## as.factor(season)Fall             4.3307     0.8245   5.253 2.61e-07 ***
## as.factor(weekday)1              -2.5799     1.6509  -1.563 0.119019    
## as.factor(weekday)2              -3.8559     1.8350  -2.101 0.036332 *  
## as.factor(weekday)3              -4.4264     1.8368  -2.410 0.016478 *  
## as.factor(weekday)4              -4.1883     1.8147  -2.308 0.021585 *  
## as.factor(weekday)5              -1.5567     1.7844  -0.872 0.383605    
## as.factor(weekday)6               0.2164     0.9245   0.234 0.815070    
## temp                             32.5149     2.3640  13.754  < 2e-16 ***
## hum                              -6.0657     2.3148  -2.620 0.009165 ** 
## windspeed                       -13.6013     3.5261  -3.857 0.000137 ***
## as.factor(weathersit)Weather_2   -2.0200     0.6616  -3.053 0.002439 ** 
## as.factor(weathersit)Weather_3   -8.1185     1.4760  -5.500 7.35e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.713 on 349 degrees of freedom
## Multiple R-squared:  0.8088, Adjusted R-squared:  0.8006 
## F-statistic: 98.44 on 15 and 349 DF,  p-value: < 2.2e-16
summary(stepwise_model3)
## 
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) + 
##     temp + hum + windspeed + as.factor(weathersit), data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2060.85  -257.74    44.82   332.76  1011.50 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      7.189e+02  1.775e+02   4.050 6.29e-05 ***
## casual                           1.403e+00  8.153e-02  17.210  < 2e-16 ***
## as.factor(workingday)Workingday  9.826e+02  7.480e+01  13.136  < 2e-16 ***
## as.factor(season)Spring          7.157e+02  9.258e+01   7.730 1.12e-13 ***
## as.factor(season)Summer          7.474e+02  1.189e+02   6.285 9.65e-10 ***
## as.factor(season)Fall            1.252e+03  8.076e+01  15.501  < 2e-16 ***
## temp                             2.135e+03  2.594e+02   8.231 3.60e-15 ***
## hum                             -5.094e+02  2.236e+02  -2.278  0.02331 *  
## windspeed                       -1.110e+03  3.486e+02  -3.183  0.00159 ** 
## as.factor(weathersit)Weather_2  -1.840e+02  6.418e+01  -2.867  0.00439 ** 
## as.factor(weathersit)Weather_3  -1.243e+03  1.438e+02  -8.645  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 457.7 on 354 degrees of freedom
## Multiple R-squared:  0.8928, Adjusted R-squared:  0.8898 
## F-statistic:   295 on 10 and 354 DF,  p-value: < 2.2e-16
avPlots(stepwise_model1)

avPlots(stepwise_model2)

avPlots(stepwise_model3)

vif(stepwise_model1)
##                           GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.019139  1        1.009524
## as.factor(season)     3.632998  3        1.239874
## temp                  3.278830  1        1.810754
## hum                   1.900579  1        1.378615
## windspeed             1.199210  1        1.095085
## as.factor(weathersit) 1.819527  2        1.161421
vif(stepwise_model2)
##                           GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 8.827923  1        2.971182
## as.factor(season)     3.659745  3        1.241391
## as.factor(weekday)    9.484996  6        1.206201
## temp                  3.291928  1        1.814367
## hum                   1.942591  1        1.393769
## windspeed             1.204556  1        1.097523
## as.factor(weathersit) 1.891041  2        1.172668
vif(stepwise_model3)
##                           GVIF Df GVIF^(1/(2*Df))
## casual                3.575021  1        1.890773
## as.factor(workingday) 2.104307  1        1.450623
## as.factor(season)     3.879645  3        1.253522
## temp                  4.203539  1        2.050253
## hum                   1.922350  1        1.386488
## windspeed             1.248688  1        1.117447
## as.factor(weathersit) 1.881977  2        1.171261

The above vif is the result after doing following changes: Because of high multicollinearity of workingday and weekday in model2, we will remove one of them from model2. We also remove sqrt from sqrt(casual) in model 3, which changes back to casual, the multicollinearity reduces because GVIF of casual goes down from 4.88 to 3.57, but the R-squared for model3 reduces from 90% to 88%, which is good because effect of overfitting reduces.

Kaylee Trial 2

#kaylee trial 2

full_model1 <- lm(registered ~ as.factor(workingday) + as.factor(season)+ as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)

full_model2 <- lm( sqrt(casual)~ as.factor(workingday) + as.factor(season)+ temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)

full_model3 <- lm( cnt ~ casual+as.factor(workingday) + as.factor(season)+ as.factor(holiday) + temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)

summary(full_model1)
## 
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) + 
##     as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed + 
##     as.factor(weathersit), data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1779.75  -237.37    62.58   308.19  1085.42 
## 
## Coefficients: (1 not defined because of singularities)
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       930.21     184.49   5.042 7.42e-07 ***
## as.factor(workingday)Workingday   746.78      93.56   7.982 2.10e-14 ***
## as.factor(season)Spring           816.26      93.31   8.747  < 2e-16 ***
## as.factor(season)Summer           799.01     122.44   6.526 2.39e-10 ***
## as.factor(season)Fall            1309.72      82.70  15.838  < 2e-16 ***
## as.factor(holiday)1              -114.41     178.98  -0.639 0.523072    
## as.factor(weekday)1               -49.07      94.39  -0.520 0.603455    
## as.factor(weekday)2                58.64      93.37   0.628 0.530438    
## as.factor(weekday)3                41.91      94.55   0.443 0.657891    
## as.factor(weekday)4                49.53      93.32   0.531 0.595893    
## as.factor(weekday)5                   NA         NA      NA       NA    
## as.factor(weekday)6               136.82      92.73   1.476 0.140967    
## temp                             2744.64     237.11  11.575  < 2e-16 ***
## hum                              -610.19     232.17  -2.628 0.008963 ** 
## windspeed                       -1480.21     353.67  -4.185 3.61e-05 ***
## as.factor(weathersit)Weather_2   -227.82      66.36  -3.433 0.000669 ***
## as.factor(weathersit)Weather_3  -1401.20     148.05  -9.465  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 472.7 on 349 degrees of freedom
## Multiple R-squared:  0.8094, Adjusted R-squared:  0.8012 
## F-statistic: 98.77 on 15 and 349 DF,  p-value: < 2.2e-16
summary(full_model2)
## 
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) + 
##     temp + hum + windspeed + as.factor(weathersit), data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.5673  -2.9959  -0.0408   2.7551  15.8412 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      19.9546     1.7664  11.297  < 2e-16 ***
## as.factor(workingday)Workingday -11.1234     0.5437 -20.459  < 2e-16 ***
## as.factor(season)Spring           6.3291     0.9419   6.719 7.29e-11 ***
## as.factor(season)Summer           3.4574     1.2358   2.798  0.00543 ** 
## as.factor(season)Fall             4.4509     0.8344   5.334 1.71e-07 ***
## temp                             31.9733     2.3927  13.363  < 2e-16 ***
## hum                              -6.0419     2.3220  -2.602  0.00966 ** 
## windspeed                       -14.0995     3.5681  -3.951 9.37e-05 ***
## as.factor(weathersit)Weather_2   -2.1217     0.6656  -3.188  0.00156 ** 
## as.factor(weathersit)Weather_3   -8.7984     1.4791  -5.948 6.49e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.78 on 355 degrees of freedom
## Multiple R-squared:    0.8,  Adjusted R-squared:  0.7949 
## F-statistic: 157.8 on 9 and 355 DF,  p-value: < 2.2e-16
summary(full_model3)
## 
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) + 
##     as.factor(holiday) + temp + hum + windspeed + as.factor(weathersit), 
##     data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2056.46  -261.73    31.05   332.27  1012.49 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      7.420e+02  1.788e+02   4.150 4.18e-05 ***
## casual                           1.397e+00  8.174e-02  17.091  < 2e-16 ***
## as.factor(workingday)Workingday  9.643e+02  7.678e+01  12.559  < 2e-16 ***
## as.factor(season)Spring          7.145e+02  9.257e+01   7.719 1.22e-13 ***
## as.factor(season)Summer          7.446e+02  1.189e+02   6.261 1.11e-09 ***
## as.factor(season)Fall            1.253e+03  8.075e+01  15.516  < 2e-16 ***
## as.factor(holiday)1             -1.606e+02  1.523e+02  -1.055  0.29216    
## temp                             2.152e+03  2.599e+02   8.282 2.54e-15 ***
## hum                             -5.233e+02  2.239e+02  -2.337  0.02001 *  
## windspeed                       -1.119e+03  3.487e+02  -3.208  0.00146 ** 
## as.factor(weathersit)Weather_2  -1.805e+02  6.425e+01  -2.810  0.00523 ** 
## as.factor(weathersit)Weather_3  -1.243e+03  1.438e+02  -8.646  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 457.6 on 353 degrees of freedom
## Multiple R-squared:  0.8932, Adjusted R-squared:  0.8899 
## F-statistic: 268.3 on 11 and 353 DF,  p-value: < 2.2e-16
#plot added variables
avPlots(full_model1)
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear

avPlots(full_model2)

avPlots(full_model3)

#kaylee
stepwise_model1 <- step(full_model1, direction = "both")
## Start:  AIC=4511.35
## registered ~ as.factor(workingday) + as.factor(season) + as.factor(holiday) + 
##     as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit)
## 
## 
## Step:  AIC=4511.35
## registered ~ as.factor(workingday) + as.factor(season) + as.factor(weekday) + 
##     temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## - as.factor(weekday)     6   1278049  79267291 4505.3
## <none>                                77989242 4511.3
## - hum                    1   1543574  79532816 4516.5
## - windspeed              1   3914363  81903605 4527.2
## - as.factor(workingday)  1   6617374  84606616 4539.1
## - as.factor(weathersit)  2  20032722  98021964 4590.8
## - temp                   1  29941742 107930985 4627.9
## - as.factor(season)      3  60480699 138469941 4714.9
## 
## Step:  AIC=4505.28
## registered ~ as.factor(workingday) + as.factor(season) + temp + 
##     hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## <none>                                79267291 4505.3
## + as.factor(holiday)     1    416658  78850634 4505.4
## - hum                    1   1666159  80933450 4510.9
## + as.factor(weekday)     6   1278049  77989242 4511.3
## - windspeed              1   3787445  83054736 4520.3
## - as.factor(weathersit)  2  19539168  98806460 4581.7
## - temp                   1  29890751 109158042 4620.1
## - as.factor(workingday)  1  39726359 118993650 4651.6
## - as.factor(season)      3  60612796 139880087 4706.6
stepwise_model2 <- step(full_model2, direction = "both")
## Start:  AIC=1151.88
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + temp + 
##     hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq     RSS    AIC
## <none>                                8110.7 1151.9
## - hum                    1     154.7  8265.4 1156.8
## - windspeed              1     356.7  8467.5 1165.6
## - as.factor(weathersit)  2     829.7  8940.4 1183.4
## - as.factor(season)      3    1450.8  9561.5 1206.0
## - temp                   1    4079.6 12190.3 1298.6
## - as.factor(workingday)  1    9562.9 17673.6 1434.2
stepwise_model3 <- step(full_model3, direction = "both")
## Start:  AIC=4483.75
## cnt ~ casual + as.factor(workingday) + as.factor(season) + as.factor(holiday) + 
##     temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## - as.factor(holiday)     1    233037  74144922 4482.9
## <none>                                73911885 4483.7
## - hum                    1   1143264  75055150 4487.4
## - windspeed              1   2154800  76066685 4492.2
## - temp                   1  14362298  88274183 4546.6
## - as.factor(weathersit)  2  15748723  89660608 4550.3
## - as.factor(workingday)  1  33025598 106937483 4616.6
## - as.factor(season)      3  54516542 128428427 4679.4
## - casual                 1  61161580 135073465 4701.8
## 
## Step:  AIC=4482.9
## cnt ~ casual + as.factor(workingday) + as.factor(season) + temp + 
##     hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## <none>                                74144922 4482.9
## + as.factor(holiday)     1    233037  73911885 4483.7
## - hum                    1   1087039  75231961 4486.2
## - windspeed              1   2121892  76266814 4491.2
## - temp                   1  14190777  88335699 4544.8
## - as.factor(weathersit)  2  15737193  89882115 4549.2
## - as.factor(workingday)  1  36139539 110284461 4625.8
## - as.factor(season)      3  54380050 128524972 4677.7
## - casual                 1  62035815 136180737 4702.8
summary(stepwise_model1)
## 
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) + 
##     temp + hum + windspeed + as.factor(weathersit), data = training_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1794.6  -246.9    47.2   311.4  1056.3 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       985.06     174.62   5.641 3.46e-08 ***
## as.factor(workingday)Workingday   716.94      53.75  13.338  < 2e-16 ***
## as.factor(season)Spring           819.04      93.12   8.796  < 2e-16 ***
## as.factor(season)Summer           805.89     122.17   6.596 1.53e-10 ***
## as.factor(season)Fall            1310.16      82.49  15.883  < 2e-16 ***
## temp                             2736.84     236.54  11.570  < 2e-16 ***
## hum                              -627.06     229.55  -2.732 0.006617 ** 
## windspeed                       -1452.78     352.74  -4.119 4.75e-05 ***
## as.factor(weathersit)Weather_2   -221.52      65.80  -3.367 0.000844 ***
## as.factor(weathersit)Weather_3  -1367.32     146.23  -9.351  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 472.5 on 355 degrees of freedom
## Multiple R-squared:  0.8062, Adjusted R-squared:  0.8013 
## F-statistic: 164.1 on 9 and 355 DF,  p-value: < 2.2e-16
summary(stepwise_model2)
## 
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) + 
##     temp + hum + windspeed + as.factor(weathersit), data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.5673  -2.9959  -0.0408   2.7551  15.8412 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      19.9546     1.7664  11.297  < 2e-16 ***
## as.factor(workingday)Workingday -11.1234     0.5437 -20.459  < 2e-16 ***
## as.factor(season)Spring           6.3291     0.9419   6.719 7.29e-11 ***
## as.factor(season)Summer           3.4574     1.2358   2.798  0.00543 ** 
## as.factor(season)Fall             4.4509     0.8344   5.334 1.71e-07 ***
## temp                             31.9733     2.3927  13.363  < 2e-16 ***
## hum                              -6.0419     2.3220  -2.602  0.00966 ** 
## windspeed                       -14.0995     3.5681  -3.951 9.37e-05 ***
## as.factor(weathersit)Weather_2   -2.1217     0.6656  -3.188  0.00156 ** 
## as.factor(weathersit)Weather_3   -8.7984     1.4791  -5.948 6.49e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.78 on 355 degrees of freedom
## Multiple R-squared:    0.8,  Adjusted R-squared:  0.7949 
## F-statistic: 157.8 on 9 and 355 DF,  p-value: < 2.2e-16
summary(stepwise_model3)
## 
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) + 
##     temp + hum + windspeed + as.factor(weathersit), data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2060.85  -257.74    44.82   332.76  1011.50 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      7.189e+02  1.775e+02   4.050 6.29e-05 ***
## casual                           1.403e+00  8.153e-02  17.210  < 2e-16 ***
## as.factor(workingday)Workingday  9.826e+02  7.480e+01  13.136  < 2e-16 ***
## as.factor(season)Spring          7.157e+02  9.258e+01   7.730 1.12e-13 ***
## as.factor(season)Summer          7.474e+02  1.189e+02   6.285 9.65e-10 ***
## as.factor(season)Fall            1.252e+03  8.076e+01  15.501  < 2e-16 ***
## temp                             2.135e+03  2.594e+02   8.231 3.60e-15 ***
## hum                             -5.094e+02  2.236e+02  -2.278  0.02331 *  
## windspeed                       -1.110e+03  3.486e+02  -3.183  0.00159 ** 
## as.factor(weathersit)Weather_2  -1.840e+02  6.418e+01  -2.867  0.00439 ** 
## as.factor(weathersit)Weather_3  -1.243e+03  1.438e+02  -8.645  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 457.7 on 354 degrees of freedom
## Multiple R-squared:  0.8928, Adjusted R-squared:  0.8898 
## F-statistic:   295 on 10 and 354 DF,  p-value: < 2.2e-16
avPlots(stepwise_model1)

avPlots(stepwise_model2)

avPlots(stepwise_model3)

vif(stepwise_model1)
##                           GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.019139  1        1.009524
## as.factor(season)     3.632998  3        1.239874
## temp                  3.278830  1        1.810754
## hum                   1.900579  1        1.378615
## windspeed             1.199210  1        1.095085
## as.factor(weathersit) 1.819527  2        1.161421
vif(stepwise_model2)
##                           GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.019139  1        1.009524
## as.factor(season)     3.632998  3        1.239874
## temp                  3.278830  1        1.810754
## hum                   1.900579  1        1.378615
## windspeed             1.199210  1        1.095085
## as.factor(weathersit) 1.819527  2        1.161421
vif(stepwise_model3)
##                           GVIF Df GVIF^(1/(2*Df))
## casual                3.575021  1        1.890773
## as.factor(workingday) 2.104307  1        1.450623
## as.factor(season)     3.879645  3        1.253522
## temp                  4.203539  1        2.050253
## hum                   1.922350  1        1.386488
## windspeed             1.248688  1        1.117447
## as.factor(weathersit) 1.881977  2        1.171261

Validation

validation_data$predicted_registered <- predict(stepwise_model1, newdata = validation_data)
validation_data$predicted_sqrtcasual <- predict(stepwise_model2, newdata = validation_data)
validation_data$predicted_count <- predict(stepwise_model3, newdata = validation_data)

Note: In the third question, it produces Nans???

observed_values1 <- validation_data$registered
predicted_values1 <- validation_data$predicted_registered
# Calculate different prediction performance metrics
# Functions from the MLmetrics package
# Common regression metrics
# Calculate the Root Mean Squared Error (RMSE), which measures the ...
# ... average magnitude of prediction errors.
# Lower is better.
rmse1 <- RMSE(predicted_values1, observed_values1)
# same as
# sqrt(mean((observed_values - predicted_values)^2))

# Compute the Mean Absolute Error (MAE), indicating the average absolute ...
# ... difference between predicted and observed values.
# Lower is better.
mae1 <- MAE(predicted_values1, observed_values1)
# same as
# mean(abs(observed_values - predicted_values))

# Calculate the Mean Absolute Percentage Error (MAPE), measuring the ...
# ... average percentage difference between predicted and observed values.
mape1 <- MAPE(predicted_values1, observed_values1)
# same as
# mean(abs(predicted_values-observed_values)/observed_values)

# Determine the R-squared (R²) Score, representing the proportion of the ...
# ... variance in the observed values (of validation data set) ... 
# ... explained by the predicted values from the model.
# Higher is better.
r_squared1 <- R2_Score(predicted_values1, observed_values1)
# same as
# summary(lm(observed_values ~ predicted_values))$r.squared

# Display the calculated metrics
cat("Root Mean Squared Error (RMSE):", round(rmse1, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 1931.118
cat("Mean Absolute Error (MAE):", round(mae1, digits = 4), "\n")
## Mean Absolute Error (MAE): 1799.516
cat("R-squared (R^2) Score:", round(r_squared1, digits = 4), "\n")
## R-squared (R^2) Score: -0.8432
cat("Mean Absolute Percentage Error (MPE):", round(mape1, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.6396
#second question
observed_values2 <- sqrt(validation_data$casual) #viki: should I transform it here?? kaylee: I think so
predicted_values2 <- validation_data$predicted_sqrtcasual
rmse2 <- RMSE(predicted_values2, observed_values2)
mae2 <- MAE(predicted_values2, observed_values2)
mape2 <- MAPE(predicted_values2, observed_values2)
r_squared2 <- R2_Score(predicted_values2, observed_values2)
cat("Root Mean Squared Error (RMSE):", round(rmse2, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 7.6403
cat("Mean Absolute Error (MAE):", round(mae2, digits = 4), "\n")
## Mean Absolute Error (MAE): 5.9599
cat("R-squared (R^2) Score:", round(r_squared2, digits = 4), "\n")
## R-squared (R^2) Score: 0.5814
cat("Mean Absolute Percentage Error (MPE):", round(mape2, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2125
#third question
observed_values3 <- validation_data$count
predicted_values3 <- validation_data$predicted_count
rmse3 <- RMSE(predicted_values3, observed_values3)
mae3 <- MAE(predicted_values3, observed_values3)
mape3 <- MAPE(predicted_values3, observed_values3)
r_squared3 <- R2_Score(predicted_values3, observed_values3)
## Warning in mean.default(y_true): argument is not numeric or logical: returning
## NA
cat("Root Mean Squared Error (RMSE):", round(rmse3, digits = 4), "\n")
## Root Mean Squared Error (RMSE): NaN
cat("Mean Absolute Error (MAE):", round(mae3, digits = 4), "\n")
## Mean Absolute Error (MAE): NaN
cat("R-squared (R^2) Score:", round(r_squared3, digits = 4), "\n")
## R-squared (R^2) Score: NaN
cat("Mean Absolute Percentage Error (MPE):", round(mape3, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): NaN

model1 R^2 = 80%, R^2 score = -0.84, RMSE = 1931

model2 R^2 = 79%, R^2 score = 0.5814, RMSE = 7.64

model3 R^3 = 89%, R^2 score = NaN, RMSE = NaN

not working for model 3??

Diagnostics of Training model

we choose model 2 because it has best RMSE and R^2 (it is done for whole 2011 year)

m.mlr <- lm( sqrt(casual)~ as.factor(workingday) + as.factor(season)+ temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)
diagnostics_df <- data.frame(
  Residuals = resid(m.mlr),
  Fitted_Values = fitted(m.mlr),
  Standardized_Residuals = rstandard(m.mlr),
  Leverage = hatvalues(m.mlr)
  #Date = year2011$dteday
)

# Create the standardized residuals vs. fitted values plot
ggplot(diagnostics_df, aes(x = Fitted_Values, y = Residuals)) +
  geom_point(col="blue", alpha=0.75) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs( title = "Residuals vs. Fitted Values",
        x = "Fitted Values", y = "Residuals") +
  theme_bw()

# Create the QQ plot
ggplot(diagnostics_df, aes(sample = Standardized_Residuals)) +
  stat_qq(aes(sample = Standardized_Residuals), distribution = qnorm,
          size = 2, col="blue", alpha = 0.75) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(title = "Standardized Residual QQ Plot",
       x = "Theoretical Quantiles", y = "Sample Quantiles") +
  theme_bw()

# Create the sqrt(|standardized residuals|) vs. fitted values plot
ggplot(diagnostics_df, aes(x = Fitted_Values, y = sqrt(abs(Standardized_Residuals)))) +
  geom_point(col="blue", alpha=0.75) +
  labs( title = "Residuals vs. Fitted Values",
        x = "Fitted Values", y = "sqrt(|Standardized Residuals|)") +
  theme_bw()

# Leverage vs Standardized Residuals
ggplot(diagnostics_df, aes(x = Leverage, y = Standardized_Residuals)) +
  geom_point(alpha = 0.75) +
  labs(title = "Standardized Residuals vs. Leverage Plot",
       x = "Leverage", y = "Standardized Residuals") +
  theme_bw()

Prediction

validation2012<- subset(validation_data)
validation2012$predicted_casual <- predict(m.mlr, newdata = validation2012)
summary(validation2012$predicted_casual)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -1.625  18.851  25.486  24.806  29.947  45.683
# Extract observed and predicted values
observed_values <- sqrt(validation2012$casual)
predicted_values <- validation2012$predicted_casual
rmse <- RMSE(predicted_values, observed_values)
mae <- MAE(predicted_values, observed_values)
mape <- MAPE(predicted_values, observed_values)
r_squared <- R2_Score(predicted_values, observed_values)


cat("Root Mean Squared Error (RMSE):", round(rmse, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 7.6403
cat("Mean Absolute Error (MAE):", round(mae, digits = 4), "\n")
## Mean Absolute Error (MAE): 5.9599
cat("R-squared (R^2) Score:", round(r_squared, digits = 4), "\n")
## R-squared (R^2) Score: 0.5814
cat("Mean Absolute Percentage Error (MPE):", round(mape, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2125
ggplot(validation2012, aes(x = observed_values, y = predicted_values)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(x = "Observed Values", y = "Predicted Values",
       title = "Observed vs. Predicted Values") +
  theme_bw()

# Residuals plot
ggplot(validation2012, aes(x = 1:nrow(validation2012), y = observed_values-predicted_values)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 0, color = "red", linetype = "dashed") +
  labs(x = "Observation Index", y = "Residuals",
       title = "Observed vs. Predicted Values") +
  theme_bw()

Fit AR(1) Model for the model2 we chose

head(training_data$dteday)
## [1] "2011-01-01" "2011-01-02" "2011-01-03" "2011-01-04" "2011-01-05"
## [6] "2011-01-06"
# Create a new data frame 'tempdataset' by selecting specific columns from 'training_data'
# For 'Date' column: training_data$dteday contains the date in "dd/mm/yyyy" format.
# We use the as.Date function and specify the format as "%d/%m/%Y" to match ...
tempdataset <- data.frame(Date = as.Date(training_data$dteday, format = "%d/%m/%Y"),
                          CasualUsers = training_data$casual,
                          WorkingDay = training_data$workingday,
                          Season = training_data$season,
                          Temperature = training_data$temp,
                          Humidity = training_data$hum,
                          WindSpeed = training_data$windspeed,
                          Weathersit = training_data$weathersit)

# Ensure there are no missing values
Dataset <- tempdataset[complete.cases(tempdataset), ]

# Remove the 'tempdataset' data frame from the R environment
rm(tempdataset)

# Add a square root transformation of the CasualUsers column
Dataset$SqrtCasualUsers <- sqrt(Dataset$CasualUsers)

# Fit the model
m.mlr <- lm(SqrtCasualUsers ~ as.factor(WorkingDay) + as.factor(Season) + 
            Temperature + Humidity + WindSpeed + as.factor(Weathersit), 
            data = Dataset)

# Check the model's summary
summary(m.mlr)
## 
## Call:
## lm(formula = SqrtCasualUsers ~ as.factor(WorkingDay) + as.factor(Season) + 
##     Temperature + Humidity + WindSpeed + as.factor(Weathersit), 
##     data = Dataset)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.5673  -2.9959  -0.0408   2.7551  15.8412 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      19.9546     1.7664  11.297  < 2e-16 ***
## as.factor(WorkingDay)Workingday -11.1234     0.5437 -20.459  < 2e-16 ***
## as.factor(Season)Spring           6.3291     0.9419   6.719 7.29e-11 ***
## as.factor(Season)Summer           3.4574     1.2358   2.798  0.00543 ** 
## as.factor(Season)Fall             4.4509     0.8344   5.334 1.71e-07 ***
## Temperature                      31.9733     2.3927  13.363  < 2e-16 ***
## Humidity                         -6.0419     2.3220  -2.602  0.00966 ** 
## WindSpeed                       -14.0995     3.5681  -3.951 9.37e-05 ***
## as.factor(Weathersit)Weather_2   -2.1217     0.6656  -3.188  0.00156 ** 
## as.factor(Weathersit)Weather_3   -8.7984     1.4791  -5.948 6.49e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.78 on 355 degrees of freedom
## Multiple R-squared:    0.8,  Adjusted R-squared:  0.7949 
## F-statistic: 157.8 on 9 and 355 DF,  p-value: < 2.2e-16
library(ggplot2)
# Time series modeling
# Create a variable indicating observations ordered by time
Dataset$Time_Seq <- order(Dataset$Date)

# Explore the correlation between GroundCO and Time
ggplot(Dataset, aes(x = Time_Seq, y = SqrtCasualUsers)) +
  geom_point(color = 'blue', alpha = 0.75) +
  geom_abline(intercept = 0, slope = 0, color = "red", linetype = "dashed") +
  labs(x = "Time", y = "SqrtCasualUsers", title = "Time vs. SqrtCasualUsers") +
  theme_bw()

# Perform Multiple Linear Regression with Time_Seq as a predictor
m.mlr.time <- lm(SqrtCasualUsers ~ as.factor(WorkingDay) + as.factor(Season) + 
            Temperature + Humidity + WindSpeed + as.factor(Weathersit) + Time_Seq, data = Dataset)

# Examine themodel fit for m.mlr.time
summary(m.mlr.time)
## 
## Call:
## lm(formula = SqrtCasualUsers ~ as.factor(WorkingDay) + as.factor(Season) + 
##     Temperature + Humidity + WindSpeed + as.factor(Weathersit) + 
##     Time_Seq, data = Dataset)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.5234  -2.9870  -0.0344   2.7588  15.7924 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      20.020403   1.775593  11.275  < 2e-16 ***
## as.factor(WorkingDay)Workingday -11.125996   0.544372 -20.438  < 2e-16 ***
## as.factor(Season)Spring           6.373544   0.949130   6.715 7.50e-11 ***
## as.factor(Season)Summer           3.647400   1.319751   2.764  0.00601 ** 
## as.factor(Season)Fall             4.839480   1.257157   3.850  0.00014 ***
## Temperature                      32.098287   2.414520  13.294  < 2e-16 ***
## Humidity                         -5.926959   2.341291  -2.531  0.01179 *  
## WindSpeed                       -14.141652   3.573771  -3.957 9.17e-05 ***
## as.factor(Weathersit)Weather_2   -2.150941   0.670078  -3.210  0.00145 ** 
## as.factor(Weathersit)Weather_3   -8.820967   1.481868  -5.953 6.35e-09 ***
## Time_Seq                         -0.001829   0.004422  -0.414  0.67937    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.785 on 354 degrees of freedom
## Multiple R-squared:  0.8001, Adjusted R-squared:  0.7944 
## F-statistic: 141.7 on 10 and 354 DF,  p-value: < 2.2e-16
# Time Series: Autocorrelation function
# We shall now study the correlation of the response with respect to time
#   - Dataset$SqrtCasualUsers: Time series data for which autocorrelation is calculated
#   - lag.max: Maximum number of lags to compute autocorrelation (default is often set to 10)
#   - plot: Boolean, whether to plot the autocorrelation function (TRUE or FALSE)
acf(Dataset$SqrtCasualUsers, lag.max = 100, plot = TRUE)

acf(residuals(m.mlr.time)[Dataset$Time_Seq], lag.max = 100, plot = TRUE)

In summary, this ACF plot suggests that there is some autocorrelation in the early lags of the SqrtCasualUsers time series, but it quickly diminishes, implying that recent past values (especially the immediate last value) are somewhat informative in predicting the current value, but this influence diminishes as you go further back in time. We will consider GLS with AR(1) because we have multiple predictors, and there may be autocorrelation in the residuals of a multiple regression model, we will use a GLS model with AR(1) correlation.

GLS model with AR(1)

#install.packages("nlme")
library(nlme)
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
# Run Generalized Least Squares (GLS) with corAR1 model (Auto-Regressive 1)
# Note: This might take some time, depending on your computer and memory
m.gls <- gls(SqrtCasualUsers ~ as.factor(WorkingDay) + as.factor(Season) + 
              Temperature + Humidity + WindSpeed + as.factor(Weathersit),
              data = Dataset,
              correlation = corAR1(form = ~ Time_Seq),
              method = "ML")  # Maximum Likelihood estimation

# Display summary information about the GLS model
summary(m.gls)
## Generalized least squares fit by maximum likelihood
##   Model: SqrtCasualUsers ~ as.factor(WorkingDay) + as.factor(Season) +      Temperature + Humidity + WindSpeed + as.factor(Weathersit) 
##   Data: Dataset 
##        AIC      BIC    logLik
##   2106.345 2153.144 -1041.173
## 
## Correlation Structure: AR(1)
##  Formula: ~Time_Seq 
##  Parameter estimate(s):
##       Phi 
## 0.4700105 
## 
## Coefficients:
##                                      Value Std.Error    t-value p-value
## (Intercept)                      21.184330  1.837644  11.527985  0.0000
## as.factor(WorkingDay)Workingday -10.084272  0.502708 -20.059904  0.0000
## as.factor(Season)Spring           6.645981  1.406163   4.726325  0.0000
## as.factor(Season)Summer           4.382162  1.758259   2.492330  0.0131
## as.factor(Season)Fall             4.503845  1.271533   3.542058  0.0004
## Temperature                      29.864538  3.206705   9.313155  0.0000
## Humidity                         -7.567471  2.174009  -3.480884  0.0006
## WindSpeed                       -15.323345  3.139098  -4.881449  0.0000
## as.factor(Weathersit)Weather_2   -2.260544  0.570305  -3.963743  0.0001
## as.factor(Weathersit)Weather_3   -8.500411  1.304037  -6.518536  0.0000
## 
##  Correlation: 
##                                 (Intr) a.(WD) as.fctr(Ssn)Sp as.fctr(Ssn)Sm
## as.factor(WorkingDay)Workingday -0.163                                     
## as.factor(Season)Spring          0.029  0.003                              
## as.factor(Season)Summer          0.111  0.019  0.702                       
## as.factor(Season)Fall           -0.096  0.008  0.583          0.589        
## Temperature                     -0.388 -0.050 -0.563         -0.747        
## Humidity                        -0.650  0.014 -0.032          0.014        
## WindSpeed                       -0.461  0.029  0.039          0.099        
## as.factor(Weathersit)Weather_2   0.306 -0.059 -0.016         -0.009        
## as.factor(Weathersit)Weather_3   0.284 -0.058 -0.014         -0.041        
##                                 a.(S)F Tmprtr Humdty WndSpd a.(W)W_2
## as.factor(WorkingDay)Workingday                                     
## as.factor(Season)Spring                                             
## as.factor(Season)Summer                                             
## as.factor(Season)Fall                                               
## Temperature                     -0.360                              
## Humidity                        -0.086 -0.114                       
## WindSpeed                        0.122 -0.072  0.215                
## as.factor(Weathersit)Weather_2   0.026  0.080 -0.552 -0.184         
## as.factor(Weathersit)Weather_3  -0.043  0.085 -0.432 -0.218  0.437  
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -3.45133400 -0.64465501 -0.01827247  0.62272981  3.45941760 
## 
## Residual standard error: 4.749606 
## Degrees of freedom: 365 total; 355 residual
# Diagnostics and plots for the time series model
# Autocorrelation plot of residuals
acf(residuals(m.gls))

# Plot of fitted values against observed values
plot(fitted(m.gls), Dataset$SqrtCasualUsers, 
     xlab = "Fitted Values", ylab = "Observed Values",
     main = "Fitted vs. Observed Values")

# Plot of residuals over time
plot(Dataset$Time_Seq, residuals(m.gls), 
     xlab = "Time", ylab = "Residuals",
     main = "Residuals over Time")

#full_model2 <- lm( sqrt(casual)~ as.factor(workingday) + as.factor(season)+ temp + hum + windspeed + as.factor(weathersit),
#                 data = training_data)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:nlme':
## 
##     getResponse

For Regression Model we got from Report 3

Model Fitting & obtain outputs of the model

model1 <- lm(registered ~ as.factor(workingday)+ as.factor(weathersit) + temp + windspeed,
             data = training_data)
summary(model1) #r^2 = 0.6532 
## 
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(weathersit) + 
##     temp + windspeed, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1804.28  -451.16   -70.39   515.08  1394.11 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      1065.70     137.48   7.752 9.41e-14 ***
## as.factor(workingday)Workingday   704.89      70.89   9.943  < 2e-16 ***
## as.factor(weathersit)Weather_2   -292.24      70.28  -4.158 4.01e-05 ***
## as.factor(weathersit)Weather_3  -1207.67     168.55  -7.165 4.44e-12 ***
## temp                             3606.33     174.57  20.659  < 2e-16 ***
## windspeed                       -2227.25     431.26  -5.165 4.00e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 624.3 on 359 degrees of freedom
## Multiple R-squared:  0.658,  Adjusted R-squared:  0.6532 
## F-statistic: 138.1 on 5 and 359 DF,  p-value: < 2.2e-16
#model2
model2 <- lm(sqrt(casual)~as.factor(season) + as.factor(weathersit) + as.factor(workingday),data = training_data)
summary(model2) #r^2 = 0.6873 
## 
## Call:
## lm(formula = sqrt(casual) ~ as.factor(season) + as.factor(weathersit) + 
##     as.factor(workingday), data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.9963  -3.5717   0.0082   3.6853  19.3328 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      22.5236     0.7917  28.449  < 2e-16 ***
## as.factor(season)Spring          14.3744     0.8760  16.409  < 2e-16 ***
## as.factor(season)Summer          17.1367     0.8728  19.633  < 2e-16 ***
## as.factor(season)Fall             9.4982     0.8887  10.688  < 2e-16 ***
## as.factor(weathersit)Weather_2   -3.6307     0.6656  -5.455 9.18e-08 ***
## as.factor(weathersit)Weather_3  -12.5801     1.5961  -7.882 3.93e-14 ***
## as.factor(workingday)Workingday -10.6997     0.6697 -15.977  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.902 on 358 degrees of freedom
## Multiple R-squared:  0.6925, Adjusted R-squared:  0.6873 
## F-statistic: 134.4 on 6 and 358 DF,  p-value: < 2.2e-16
#model3
model3 <- lm(cnt ~  temp + windspeed + hum  + as.factor(workingday),data = training_data)
summary(model3) #r^2 = 0.6511 
## 
## Call:
## lm(formula = cnt ~ temp + windspeed + hum + as.factor(workingday), 
##     data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2891.63  -500.88   -34.64   651.15  1940.44 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      2413.11     264.78   9.114  < 2e-16 ***
## temp                             5592.41     228.62  24.461  < 2e-16 ***
## windspeed                       -4021.07     570.77  -7.045 9.48e-12 ***
## hum                             -1467.65     296.35  -4.952 1.13e-06 ***
## as.factor(workingday)Workingday   -21.41      91.92  -0.233    0.816    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 814.4 on 360 degrees of freedom
## Multiple R-squared:  0.655,  Adjusted R-squared:  0.6511 
## F-statistic: 170.8 on 4 and 360 DF,  p-value: < 2.2e-16
#Model Diagnostics: to assess assumptions and multicollinearity

#!!!Model1!!!
# Plotting added variable plots
avPlots(model1)

# Check for multicollinearity: computing variance inflation factors (VIFs)
vif(model1)
##                           GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.015839  1        1.007888
## as.factor(weathersit) 1.035705  2        1.008809
## temp                  1.023139  1        1.011503
## windspeed             1.027020  1        1.013420
# Create a data frame with the residuals and fitted values
diagnostics_df1 <- data.frame(Residuals1 = resid(model1),
                             Fitted_Values1 = fitted(model1),
                             Standardized_Residuals1 = rstandard(model1),
                             Leverage1 = hatvalues(model1),
                             Season = training_data$season,
                             Workingday = training_data$workingday,
                             Weather = training_data$weathersit)

#!!!Model2!!!
avPlots(model2)

vif(model2)
##                           GVIF Df GVIF^(1/(2*Df))
## as.factor(season)     1.034514  3        1.005671
## as.factor(weathersit) 1.046385  2        1.011400
## as.factor(workingday) 1.014109  1        1.007030
diagnostics_df2 <- data.frame(Residuals2 = resid(model2),
                              Fitted_Values2 = fitted(model2),
                              Standardized_Residuals2 = rstandard(model2),
                              Leverage2 = hatvalues(model2),
                              Season = training_data$season,
                            Workingday = training_data$workingday,
                             Weather = training_data$weathersit)

#!!!Model3!!!
avPlots(model3)

vif(model3)
##                  temp             windspeed                   hum 
##              1.031276              1.057145              1.066509 
## as.factor(workingday) 
##              1.003524
diagnostics_df3 <- data.frame(Residuals3 = resid(model3),
                              Fitted_Values3 = fitted(model3),
                              Standardized_Residuals3 = rstandard(model3),
                              Leverage3 = hatvalues(model3),
                              Season = training_data$season,
                              Workingday = training_data$workingday,
                             Weather = training_data$weathersit)

plot the diagnostics

#!!!Model1!!!
# Create the standardized residuals vs. fitted values plot
ggplot(diagnostics_df1, aes(x = Fitted_Values1, y = Residuals1)) +
  geom_point(col="blue", alpha=0.75) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs( title = "Residuals vs. Fitted Values",
        x = "Fitted Values", y = "Residuals") +
  theme_bw()

# Create the QQ plot
ggplot(diagnostics_df1, aes(sample = Standardized_Residuals1)) +
  stat_qq(aes(sample = Standardized_Residuals1), distribution = qnorm,
          size = 2, col="blue", alpha = 0.75) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(title = "Standardized Residual QQ Plot",
       x = "Theoretical Quantiles", y = "Sample Quantiles") +
  theme_bw()

# Create the sqrt(|standardized residuals|) vs. fitted values plot
ggplot(diagnostics_df1, aes(x = Fitted_Values1, y = sqrt(abs(Standardized_Residuals1)))) +
  geom_point(col="blue", alpha=0.75) +
  labs( title = "Residuals vs. Fitted Values",
        x = "Fitted Values", y = "sqrt(|Standardized Residuals|)") +
  theme_bw()

# Leverage vs Standardized Residuals
ggplot(diagnostics_df1, aes(x = Leverage1, y = Standardized_Residuals1)) +
  geom_point(alpha = 0.75) +
  labs(title = "Standardized Residuals vs. Leverage Plot",
       x = "Leverage", y = "Standardized Residuals") +
  theme_bw()

#!!!Model2!!!
#residual vs fitted values
ggplot(diagnostics_df2, aes(x = Fitted_Values2, y = Residuals2)) +
  geom_point(col="blue", alpha=0.75) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs( title = "Residuals vs. Fitted Values",
        x = "Fitted Values", y = "Residuals") +
  theme_bw()

#QQ plot
ggplot(diagnostics_df2, aes(sample = Standardized_Residuals2)) +
  stat_qq(aes(sample = Standardized_Residuals2), distribution = qnorm,
          size = 2, col="blue", alpha = 0.75) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(title = "Standardized Residual QQ Plot",
       x = "Theoretical Quantiles", y = "Sample Quantiles") +
  theme_bw()

#sqrt(|standardized residuals|) vs. fitted values plot
ggplot(diagnostics_df2, aes(x = Fitted_Values2, y = sqrt(abs(Standardized_Residuals2)))) +
  geom_point(col="blue", alpha=0.75) +
  labs( title = "Residuals vs. Fitted Values",
        x = "Fitted Values", y = "sqrt(|Standardized Residuals|)") +
  theme_bw()

# Leverage vs Standardized Residuals
ggplot(diagnostics_df2, aes(x = Leverage2, y = Standardized_Residuals2)) +
  geom_point(alpha = 0.75) +
  labs(title = "Standardized Residuals vs. Leverage Plot",
       x = "Leverage", y = "Standardized Residuals") +
  theme_bw()

#!!!Model3!!!
#residual vs fitted values
ggplot(diagnostics_df3, aes(x = Fitted_Values3, y = Residuals3)) +
  geom_point(col="blue", alpha=0.75) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs( title = "Residuals vs. Fitted Values",
        x = "Fitted Values", y = "Residuals") +
  theme_bw()

#QQ plot
ggplot(diagnostics_df3, aes(sample = Standardized_Residuals3)) +
  stat_qq(aes(sample = Standardized_Residuals3), distribution = qnorm,
          size = 2, col="blue", alpha = 0.75) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(title = "Standardized Residual QQ Plot",
       x = "Theoretical Quantiles", y = "Sample Quantiles") +
  theme_bw()

#sqrt(|standardized residuals|) vs. fitted values plot
ggplot(diagnostics_df3, aes(x = Fitted_Values3, y = sqrt(abs(Standardized_Residuals3)))) +
  geom_point(col="blue", alpha=0.75) +
  labs( title = "Residuals vs. Fitted Values",
        x = "Fitted Values", y = "sqrt(|Standardized Residuals|)") +
  theme_bw()

# Leverage vs Standardized Residuals
ggplot(diagnostics_df3, aes(x = Leverage3, y = Standardized_Residuals3)) +
  geom_point(alpha = 0.75) +
  labs(title = "Standardized Residuals vs. Leverage Plot",
       x = "Leverage", y = "Standardized Residuals") +
  theme_bw()

#Prediction: training and validation

#Model1 - training

#model1
training_data$Predicted_registered <- predict(model1, training_data)
# Extract observed and predicted values
observed_values1 <- training_data$registered
predicted_values1 <- training_data$Predicted_registered
# Calculate different prediction performance metrics
#rmse
rmse1 <- RMSE(predicted_values1, observed_values1)
#mae
mae1 <- MAE(predicted_values1, observed_values1)
#mape
mape1 <- MAPE(predicted_values1, observed_values1)
#r^2
r_squared1 <- R2_Score(predicted_values1, observed_values1)
#display
cat("Root Mean Squared Error (RMSE):", round(rmse1, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 619.1167
cat("Mean Absolute Error (MAE):", round(mae1, digits = 4), "\n")
## Mean Absolute Error (MAE): 519.1348
cat("R-squared (R^2) Score:", round(r_squared1, digits = 4), "\n")
## R-squared (R^2) Score: 0.658
cat("Mean Absolute Percentage Error (MPE):", round(mape1, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2562
# Root Mean Squared Error (RMSE): 619.1167 
# Mean Absolute Error (MAE): 519.1348 
# R-squared (R^2) Score: 0.658 
# Mean Absolute Percentage Error (MPE): 0.2562 

#Model1 - validation

validation_data$Predicted_registered <- predict(model1, validation_data)
# Extract observed and predicted values
observed_values1 <- validation_data$registered
predicted_values1 <- validation_data$Predicted_registered
# Calculate different prediction performance metrics
#rmse
rmse1 <- RMSE(predicted_values1, observed_values1)
#mae
mae1 <- MAE(predicted_values1, observed_values1)
#mape
mape1 <- MAPE(predicted_values1, observed_values1)
#r^2
r_squared1 <- R2_Score(predicted_values1, observed_values1)
#display
cat("Root Mean Squared Error (RMSE):", round(rmse1, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 1993.671
cat("Mean Absolute Error (MAE):", round(mae1, digits = 4), "\n")
## Mean Absolute Error (MAE): 1805.783
cat("R-squared (R^2) Score:", round(r_squared1, digits = 4), "\n")
## R-squared (R^2) Score: -0.9646
cat("Mean Absolute Percentage Error (MPE):", round(mape1, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.5715
# Root Mean Squared Error (RMSE): 1993.671 
# Mean Absolute Error (MAE): 1805.783 
# R-squared (R^2) Score: -0.9646 
# Mean Absolute Percentage Error (MPE): 0.5715 

#Model2 - training

# Extract observed and predicted values
sqrt_casual <- sqrt(training_data$casual)
observed_values2 <- sqrt_casual
training_data$Predicted_sqrt_casual <- predict(model2, training_data)
predicted_values2 <- training_data$Predicted_sqrt_casual 

#rmse
rmse2 <- RMSE(predicted_values2, observed_values2)
#mae
mae2 <- MAE(predicted_values2, observed_values2)
#mape
mape2 <- MAPE(predicted_values2, observed_values2)
#r^2
r_squared2 <- R2_Score(predicted_values2, observed_values2)
#display
cat("Root Mean Squared Error (RMSE):", round(rmse2, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 5.8451
cat("Mean Absolute Error (MAE):", round(mae2, digits = 4), "\n")
## Mean Absolute Error (MAE): 4.5223
cat("R-squared (R^2) Score:", round(r_squared2, digits = 4), "\n")
## R-squared (R^2) Score: 0.6925
cat("Mean Absolute Percentage Error (MPE):", round(mape2, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2555
# Root Mean Squared Error (RMSE): 5.8451 
# Mean Absolute Error (MAE): 4.5223 
# R-squared (R^2) Score: 0.6925 
# Mean Absolute Percentage Error (MPE): 0.2555 

#Model2 - validation

# Extract observed and predicted values
sqrt_casual <- sqrt(validation_data$casual)
observed_values2 <- sqrt_casual
validation_data$Predicted_sqrt_casual <- predict(model2, validation_data)
predicted_values2 <- validation_data$Predicted_sqrt_casual 

#rmse
rmse2 <- RMSE(predicted_values2, observed_values2)
#mae
mae2 <- MAE(predicted_values2, observed_values2)
#mape
mape2 <- MAPE(predicted_values2, observed_values2)
#r^2
r_squared2 <- R2_Score(predicted_values2, observed_values2)
#display
cat("Root Mean Squared Error (RMSE):", round(rmse2, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 8.9667
cat("Mean Absolute Error (MAE):", round(mae2, digits = 4), "\n")
## Mean Absolute Error (MAE): 6.9653
cat("R-squared (R^2) Score:", round(r_squared2, digits = 4), "\n")
## R-squared (R^2) Score: 0.4235
cat("Mean Absolute Percentage Error (MPE):", round(mape2, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2525
# Root Mean Squared Error (RMSE): 8.9667 
# Mean Absolute Error (MAE): 6.9653 
# R-squared (R^2) Score: 0.4235 
# Mean Absolute Percentage Error (MPE): 0.2525 

#Model 3 - training

observed_values3 <- training_data$cnt
training_data$Predicted_cnt <- predict(model3, newdata = training_data)
predicted_values3 <- training_data$Predicted_cnt
#rmse
rmse3 <- RMSE(predicted_values3, observed_values3)
#mae
mae3 <- MAE(predicted_values3, observed_values3)
#mape
mape3 <- MAPE(predicted_values3, observed_values3)
#r^2
r_squared3 <- R2_Score(predicted_values2, observed_values2)
#display
cat("Root Mean Squared Error (RMSE):", round(rmse3, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 808.7555
cat("Mean Absolute Error (MAE):", round(mae3, digits = 4), "\n")
## Mean Absolute Error (MAE): 649.4414
cat("R-squared (R^2) Score:", round(r_squared3, digits = 4), "\n")
## R-squared (R^2) Score: 0.4235
cat("Mean Absolute Percentage Error (MPE):", round(mape3, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2749
# Root Mean Squared Error (RMSE): 808.7555 
# Mean Absolute Error (MAE): 649.4414 
# R-squared (R^2) Score: 0.4235 
# Mean Absolute Percentage Error (MPE): 0.2749 

#Model 3 - validation

observed_values3 <- validation_data$cnt
validation_data$Predicted_cnt <- predict(model3, newdata = validation_data)
predicted_values3 <- validation_data$Predicted_cnt
#rmse
rmse3 <- RMSE(predicted_values3, observed_values3)
#mae
mae3 <- MAE(predicted_values3, observed_values3)
#mape
mape3 <- MAPE(predicted_values3, observed_values3)
#r^2
r_squared3 <- R2_Score(predicted_values2, observed_values2)
#display
cat("Root Mean Squared Error (RMSE):", round(rmse3, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 2373.45
cat("Mean Absolute Error (MAE):", round(mae3, digits = 4), "\n")
## Mean Absolute Error (MAE): 2119.981
cat("R-squared (R^2) Score:", round(r_squared3, digits = 4), "\n")
## R-squared (R^2) Score: 0.4235
cat("Mean Absolute Percentage Error (MPE):", round(mape3, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.6334
# Root Mean Squared Error (RMSE): 2373.45 
# Mean Absolute Error (MAE): 2119.981 
# R-squared (R^2) Score: 0.4235 
# Mean Absolute Percentage Error (MPE): 0.6334